Outline of Process
NOTE: This markdown will map the process to create multiple dataset timepoints
Retrieve/receive PIDNs and compile PIDN list
Pull LAVA data required for timepoint function:
infoprocessingspeed
agingcogmeasures
neuropsychbedside
neuropsychcvlt
neuropsychmmse
udsneuropsychmoca
udsneuropsych
udscdr
Run timepoint function
#read in the lava files downloaded to the R Drive
# FIX: the old extension filter '.csv|.xlsx' used unescaped, unanchored patterns,
# so it matched any character followed by "csv"/"xlsx" anywhere in the name
# (e.g. 'data.xlsx.bak' or 'mycsv_notes.txt' would slip through). Anchor at the
# end of the filename and escape the dot.
lava_files <- list.files(str_c(datafolder, 'LavaQuery_Outputs/lava_pull'),
                         recursive = FALSE, all.files = FALSE, full.names = TRUE)%>%
  #drop Office temp/lock files (names containing ~ or $) and Word documents
  str_subset('[~$]|\\.docx', negate = TRUE)%>%
  #keep only files whose extension is exactly .csv or .xlsx
  str_subset('\\.(csv|xlsx)$')
#save all the lava sheets into a list of dataframes
# read_all_files() (project helper) returns one data frame per file; clean_names
# standardizes column names to snake_case.
lava <- read_all_files(lava_files, clean_names = TRUE)%>%
#set names to the sheet name without the suffix and '_mac_'
# (str_remove_all strips the '_mac_' marker and anything from '_x' onward)
set_names(~str_remove_all(.x,c('_mac_|_x.*')))%>%
#convert dc_date to date datatype and rename it to DCDate if there is a dc_date column
# Tables without a dc_date column are passed through unchanged (the else branch).
map(function(x){
if(any(names(x) == 'dc_date')){
mutate(x,dc_date = as_date(dc_date))%>%
rename(DCDate=dc_date)}
else(x)})
# get rid of duplicate entries by prioritizing entry with least NAs and visit types
# These eight instruments feed the timepoint-generation step below; clean_instrument()
# (project helper) deduplicates per visit and recodes negative values to NA.
tp_instruments <- c(
  "infoprocessingspeed",
  "agingcogmeasures",
  "neuropsychbedside",
  "neuropsychcvlt",
  "neuropsychmmse",
  "udsneuropsychmoca",
  "udsneuropsych",
  "udscdr"
)
lava <- modify_at(lava, tp_instruments, clean_instrument, replace_neg_values = TRUE)
NOTE: If using data from the file used to extract PIDNs (or PIDNs and Dates), store that data in the environment
Store the PIDNs (or PIDNs and Dates) in a list
Rename the date columns to DCDate and convert them to the Date data type
Create list of dfs
original_data_tps: Timepoints being generated here
original_data_no_tps: Only using pre-existing timepoints
#run this function to select file interactively
#some_file<-read_file()
# BrANCH: PIDN roster only
original_data_BrANCH <- read_csv(str_c(datafolder,'step_one/BrANCH_dataset_pidns.csv'))
# specimens: one row per sample; collection date becomes the visit date (DCDate)
original_data_specimens <- read_excel(str_c(datafolder,'step_one/specimens_dataset_pidns_data.xlsx'))%>%
  mutate(DCDate = as_date(SampleDate))%>%
  select(PIDN, DCDate, SampleID)
# sleep: snake_case the headers, then standardize the key columns
original_data_sleep <- read_excel(str_c(datafolder,'step_one/sleep_dataset_pidns_data.xlsx'))%>%
  clean_names()%>%
  rename(PIDN = pidn, DCDate = sleep_study_date)%>%
  mutate(DCDate = as_date(DCDate))
# import fitbit data and store in environment
fitbit <- read_csv(str_c(datafolder,'Export-3-2-2023/final_fitbit_Export-3-2-2023.csv'))%>%
  # PIDNs <= 50 are excluded (presumably test/pilot IDs -- confirm with data dictionary)
  filter(PIDN > 50)%>%
  ungroup()
# filter out ActAN data and any participant with project labeled excluded from Hillblom
# FIX: the previous condition !(ProjectType == 'ActAN' & ProjectType != "EXCLUDED from Hillblom")
# reduced to !(ProjectType == 'ActAN'), because a single value can never equal
# 'ActAN' AND differ from "EXCLUDED from Hillblom" at the same time -- so the
# "EXCLUDED from Hillblom" rows were never removed. Exclude both labels explicitly.
# NOTE(review): rows with NA ProjectType are now kept (they were dropped before
# because the NA condition filtered them out) -- confirm that is the desired behavior.
fitbit%>%
  filter(!ProjectType %in% c('ActAN', 'EXCLUDED from Hillblom'))%>%
  group_by(PIDN)%>%
  #per-participant sequence number (1, 2, ... in current row order)
  mutate(Timepoint = zoo::index(PIDN)) -> original_data_fitbit
# FIX: `if (cond) a else b -> x` parses in R as `if (cond) a else (b -> x)`,
# so when `some_file` existed `original_data_tps` was NEVER assigned (the result
# of the if-branch was simply discarded). Assign the whole conditional with `<-`.
# original_data_tps: datasets whose timepoints are generated below
original_data_tps <- if (exists('some_file')) {
  lst(original_data_BrANCH, original_data_specimens, some_file)
} else {
  lst(original_data_BrANCH, original_data_specimens)
}
# original_data_no_tps: datasets that already carry their own timepoints
original_data_no_tps <- lst(original_data_sleep, original_data_fitbit)
# Build candidate timepoints: for each original dataset, join on PIDN every LAVA
# cognitive visit, each tagged with a priority code TPhierarchy (1 = highest
# priority instrument ... 8 = lowest). A visit only receives a hierarchy value
# when the instrument has usable data (key score present / non-negative);
# rows without any hierarchy value are dropped before the join.
timepoints_dfs <- map(original_data_tps, function(df){
df%>%
select(PIDN)%>%
left_join(bind_rows(pluck(lava,'infoprocessingspeed')%>%
mutate(TPhierarchy = ifelse(anim_yes_med > 0, 1, NA)),
pluck(lava,'agingcogmeasures')%>%
mutate(TPhierarchy = ifelse((dot_total >= 0)|
(df_emp_c >= 0)|
(df_swch_c >= 0), 2, NA)),
pluck(lava,'neuropsychbedside')%>%
mutate(TPhierarchy = ifelse((mt_time >= 0) |
(mod_rey >= 0) |
(d_corr >= 0) |
(bnt_tot >= 0), 3, NA)),
pluck(lava,'neuropsychcvlt')%>%
mutate(TPhierarchy = ifelse((corr10 >= 0) |
(cv2lfrc >= 0), 4, NA)),
pluck(lava,'neuropsychmmse')%>%
mutate(TPhierarchy = ifelse(mmse_tot >= 0, 5, NA)),
pluck(lava,'udsneuropsychmoca')%>%
mutate(TPhierarchy = ifelse((mocatots >= 0) |
(craftvrs >= 0) |
(crafturs >= 0) |
(traila >= 0) |
(minttots >= 0), 6, NA)),
pluck(lava,'udsneuropsych')%>%
mutate(TPhierarchy = ifelse((logimem >= 0) |
(traila >= 0), 7, NA)),
pluck(lava,'udscdr')%>%
mutate(TPhierarchy = ifelse(cdrglob >= 0, 8, NA)))%>%
# one row per PIDN/visit-date/instrument-priority, instruments without data removed
distinct(PIDN, DCDate, TPhierarchy)%>%
filter(!is.na(TPhierarchy)), by = 'PIDN')%>%
arrange(PIDN, DCDate, TPhierarchy)})
# Collapse the candidate rows to one row per PIDN/DCDate, recording:
#  - instruments: list-column of every TPhierarchy code collected that day
#  - cat_A/B/C: whether ALL codes that day fall into priority band 1-2 / 3-5 / 6-8
#  - instrument_category: 'a'/'b'/'c' for a single band, 'multiple' otherwise
#  - min_TPhierarchy: the highest-priority (lowest) code for the day
# NOTE: rows are assumed sorted by DCDate within PIDN (the previous step arranges
# them), so lag()/lead() compare chronologically adjacent candidate rows.
timepoints_dfs <- map(timepoints_dfs, function(df) {
df%>%
group_by(PIDN)%>%
#create logical if data within 6 month period of a timepoint to prevent sparse tps
mutate(other_instruments_within_six_months = ifelse(
difftime(DCDate, lag(DCDate), units = "days") <= 183 & !is.na(lag(DCDate)) |
difftime(lead(DCDate),DCDate, units = "days") <= 183 & !is.na(lead(DCDate)), 1,0))%>%
#get list of instruments for each datapoint
group_by(PIDN, DCDate) %>%
mutate(instruments = lst(TPhierarchy),
cat_A = all(TPhierarchy == 1 | TPhierarchy == 2),
cat_B = all(TPhierarchy == 3 | TPhierarchy == 4 | TPhierarchy == 5),
cat_C = all(TPhierarchy == 6 | TPhierarchy == 7 | TPhierarchy == 8),
instrument_category = case_when(all(cat_A & !cat_B & !cat_C) ~ "a",
all(!cat_A & cat_B & !cat_C) ~ "b",
all(!cat_A & !cat_B & cat_C) ~ "c",
all(!cat_A & !cat_B & !cat_C) ~ "multiple"),
min_TPhierarchy = min(TPhierarchy))%>%
distinct(PIDN, DCDate, instruments, min_TPhierarchy,
other_instruments_within_six_months, instrument_category, cat_A, cat_B, cat_C)})
# Annotate every visit with nested data frames of its neighboring visits, using
# find_data_within_range (project helper) over four windows, then join the four
# results back onto the visit table by PIDN/DCDate:
#  - timepoints_within_lag_or_lag_range: <=180 days backward (incl. the previous visit)
#  - timepoints_within_six_months:       <=180 days either direction, excluding self
#  - timepoints_within_six_months_to_year: 180-365 days forward, excluding self
#  - timepoints_within_lead_or_lead_range: <=180 days forward (incl. the next visit)
timepoints_dfs <- furrr::future_map(timepoints_dfs,
function(tp_df) {
purrr::reduce(list(tp_df,
find_data_within_range(df = tp_df,
max_num_days = 180,
column_name = timepoints_within_lag_or_lag_range,
direction = 'backward',
include_self = TRUE,
inclusive = TRUE,
lag_or_lead = 'lag'),
find_data_within_range(df = tp_df,
max_num_days = 180,
column_name = timepoints_within_six_months,
direction = 'both',
include_self = FALSE,
inclusive = TRUE),
find_data_within_range(df = tp_df,
max_num_days = 365,
min_num_days = 180,
column_name = timepoints_within_six_months_to_year,
direction = 'forward',
include_self = FALSE,
inclusive = TRUE),
find_data_within_range(df = tp_df,
max_num_days = 180,
column_name = timepoints_within_lead_or_lead_range,
direction = 'forward',
lag_or_lead = 'lead',
inclusive = TRUE,
include_self = TRUE)),
dplyr::left_join, by = c('PIDN', 'DCDate'))})
# Derive per-visit logical flags by comparing each visit's instruments / category /
# priority against the nested neighbor tables built in the previous step. Each
# flag feeds the validity rules applied in the following block.
furrr::future_map(
timepoints_dfs,
function(df) {
df%>%
#detects current instruments in previous visit or any visit within 6 months from previous visit
mutate(any_instr_within_lag_or_six_months_from_lag =
#compare current instruments to data nested in 'timepoint_within_lag_range'
map2_lgl(instruments,
pluck(timepoints_within_lag_or_lag_range),
#ANY instrument at current tp found in the 'instruments' inside nested df
function(current_instruments, instruments_within_range)
any(current_instruments %in%
flatten_dbl(pluck(instruments_within_range,
'instruments', .default = lst())))),
#detects whether current tp category is found in next visit or six months from next visit
group_within_six_months_from_lead =
#selects current timepoint instrument category
map2_lgl(instrument_category,
#gathers dataframe nested in column
#'timepoints_within_lead_or_lead_range'
pluck(timepoints_within_lead_or_lead_range),
#if current timepoint has data from single instrument category, detect
#whether that instrument category collected at next visit or within
#months from next visit
function(current_category, dates_within_range)
any(str_subset(current_category, 'multiple', negate = TRUE) %in%
pluck(dates_within_range, 'instrument_category'))),
# detects whether any instruments at current timepoint are found within next visit
# or any visit within six months from next visit
any_instr_within_lead_or_six_months_from_lead =
#compare current instruments
map2_lgl(instruments,
#gather dataframe in nested in column
#'timepoints_within_lead_or_lead_range'
pluck(timepoints_within_lead_or_lead_range),
#compare current instruments to all 'instruments' within the next
#visit and visits within 180 days from next visit
function(current_instruments, dates_within_range)
any(current_instruments %in%
flatten_dbl(pluck(dates_within_range,
'instruments', .default = lst())))),
#detects whether any instrument at current tp are present in any visit 6mo to year away
any_instr_within_next_six_months_to_year =
#select current instruments to compare
map2_lgl(instruments,
#gather dataframe nested in column
#'timepoints_within_six_months_to_year'
pluck(timepoints_within_six_months_to_year),
#detect whether any of the current instruments are found within
#timepoints 180-365 days from current
function(current_instruments, dates_within_range)
any(current_instruments %in% flatten_dbl(pluck(dates_within_range,
'instruments',
.default = lst())))),
#detects whether any instruments at current timepoint are found within any visit
#within six months
any_instr_within_six_months =
#select current instrument
map2_lgl(instruments,
#gather df nested in column 'timepoints_within_six_months'
pluck(timepoints_within_six_months),
#detect whether any current instrument is found within 180 days in
#either direction from current visit
function(current_instruments, dates_within_range)
any(current_instruments %in% flatten_dbl(pluck(dates_within_range,
'instruments',
.default = lst())))),
#detects whether any timepoint within 180 days in either direction contains data
#from more than one category
any_timepoints_with_data_from_more_than_one_cat_within_six_months =
#gathers dataframe nested in column 'timepoints_within_six_months' and detects
#whether any of those visits collected data from more than one category
map_lgl(pluck(timepoints_within_six_months),
function(x) any(str_detect(pluck(x,'instrument_category'), 'multiple'))),
#detects whether the current visit has the highest priority compared to all visits
#within 180 days in either direction
highest_priority_within_six_months =
#compares current minimum timepoint hierarchy
map2_lgl(min_TPhierarchy,
#gathers dataframe nested in column 'timepoints_within_six_months'
pluck(timepoints_within_six_months),
#compares current timepoint hierarchy to all visit timepoint hierarchy
#within 180 days in either direction
function(current_min_tph, tps_within_range)
all(current_min_tph <= (pluck(tps_within_range,
'min_TPhierarchy')))))}) ->
timepoints_dfs
# Apply the validity rules: a visit becomes a kept timepoint when ANY of the six
# criteria below is met (isolated visit, multi-category visit with distant
# neighbors, priority single-category visit with repeated instruments, etc.).
# Only valid PIDN/DCDate pairs survive; all flag columns are dropped at the end.
map(
timepoints_dfs,
function(df) {
df%>%
group_by(PIDN)%>%
#group by PIDN to compare participant data to itself
mutate(lag_timepoint_diff = as.numeric(difftime(DCDate, lag(DCDate) ,units = 'days')),
#days from prev visit
lead_timepoint_diff = as.numeric(difftime(lead(DCDate), DCDate, units = 'days')),
#days from next visit
lead_timepoint_within_range =
between(as.numeric(difftime(lead(DCDate), DCDate, units = 'days')), 180, 365),
#if next visit between 180-365 days
#using to check instruments that only have one category
lag_timepoint_within_range =
between(as.numeric(difftime(DCDate, lag(DCDate), units = 'days')), 180, 365),
#lag between 180-365---using to check instruments that only have one category
#if current timepoint is more than a year away from other timepoints,meets criteria
#of being isolated
isolated_timepoint = if_else(
#set to valid if first timepoint and next is more than a year away
(is.na(lag_timepoint_diff) & 365 < lead_timepoint_diff) |
#valid if more than 365 days from previous visit and last visit on file
(is.na(lead_timepoint_diff) & lag_timepoint_diff > 365) |
#valid if only participant timepoint
(is.na(lag_timepoint_diff) & is.na(lead_timepoint_diff)) |
#valid if no other visits within 365 days from current
(lag_timepoint_diff > 365 & lead_timepoint_diff > 365), 1 ,0),
#compare with priority of data within six months if next (x2) or previous (x2)
#timepoint more than a year away or next (x2) & previous (x2) don't exist
six_month_single_category_priority_far_neighbors = if_else(
#If current tp has instruments from different categories
instrument_category != 'multiple' &
other_instruments_within_six_months &
highest_priority_within_six_months &
!any_timepoints_with_data_from_more_than_one_cat_within_six_months &
(lag_timepoint_diff > 365 |
lead_timepoint_diff > 365 |
((as.integer(difftime(DCDate, lag(DCDate, n = 2), units = 'days')) > 365 |
is.na(lag(DCDate, n = 2))) &
(as.integer(difftime(lead(DCDate, n = 2), DCDate, units = 'days')) > 365 |
is.na(lead(DCDate, n = 2))))), 1, 0),
#if timepoint contains instruments from more than one instrument category and next
#or previous visit is 180 days or more from current or doesn't exist
multi_cat_neighbors_six_months_plus = if_else(
(lag_timepoint_diff >= 180 | lead_timepoint_diff >= 180 |
is.na(lead_timepoint_diff) | is.na(lag_timepoint_diff)) &
instrument_category == 'multiple', 1, 0),
#if takes priority over tps within six months and instrume was repeated in lag/lead
single_cat_priority_repeated_instr_lag_lead_range = if_else(
#If current visit has data from more than one category
(instrument_category != 'multiple' &
#no multicategory visits within six months in either direction
!any_timepoints_with_data_from_more_than_one_cat_within_six_months &
#highest priority over all visits within six months in either direction
highest_priority_within_six_months &
#next visit is within 180-365 days and any of current instruments are
#found in next visit or within 180 days after next visit
((lead_timepoint_within_range &
any_instr_within_lead_or_six_months_from_lead) |
#previous visit is within 180-365 days and any of current instruments
#are found in previous visit or within 180 days before previous visit
(lag_timepoint_within_range &
any_instr_within_lag_or_six_months_from_lag) |
#other timepoints within 6 months from current timepoint exist and
#current timepoint is last or first visit
(other_instruments_within_six_months &
(is.na(lead_timepoint_diff) |
is.na(lag_timepoint_diff))))) |
#no multicategory visits within six months in either direction
(instrument_category != 'multiple' &
#current visit is last visit and any of the current instruments are
#found in the previous visit or within 180 days before previous visit
((is.na(lead_timepoint_diff) &
any_instr_within_lag_or_six_months_from_lag) |
#current visit is first visit and any of the current instruments
#are found in the next visit or within 180 days after next visit
(is.na(lag_timepoint_diff) &
any_instr_within_lead_or_six_months_from_lead))), 1, 0),
#if timepoint contains an instrument that was collected within six months (forward
#or backward), criteria is met
six_month_neighbor_instr_repeated_lag_lead_range = if_else(
#catch all where instrument was repeated within 6 months from current date
#backward or forward
(lead_timepoint_diff < 180 &
any_instr_within_lead_or_six_months_from_lead) |
(lag_timepoint_diff < 180 &
any_instr_within_lag_or_six_months_from_lag), 1, 0),
#valid if collected data from more than one category if immediate neighbors are
#single category and next/previous visit from neighbor is more than 6 months away
stuck_between_single_cat = if_else(
instrument_category == 'multiple' &
lead(instrument_category) != 'multiple' &
lag(instrument_category) != 'multiple' &
(as.integer(difftime(DCDate, lag(DCDate, n = 2), units = 'days')) > 180 |
is.na(lag(DCDate, n = 2))) &
(as.integer(difftime(lead(DCDate, n = 2), DCDate, units = 'days')) > 180 |
is.na(lead(DCDate, n = 2))), 1 ,0),
#valid if any one of the criteria above met
valid = if_else(isolated_timepoint == 1 |
single_cat_priority_repeated_instr_lag_lead_range == 1 |
multi_cat_neighbors_six_months_plus == 1 |
six_month_neighbor_instr_repeated_lag_lead_range == 1 |
stuck_between_single_cat == 1 |
six_month_single_category_priority_far_neighbors == 1, 1, 0))%>%
filter(valid == 1)%>%
distinct(PIDN, DCDate)}) ->
timepoints_dfs
# Merge the original specimen rows with the generated specimen timepoints, then
# keep a visit only if it has a SampleID itself OR no visit within a year of it
# has one (so sample-less visits are kept only when truly unanchored). The
# result overwrites the specimens entry of timepoints_dfs via pluck<- assignment.
pluck(original_data_tps, 'original_data_specimens')%>%
bind_rows(pluck(timepoints_dfs, 'original_data_specimens'))%>%
arrange(PIDN, DCDate, SampleID)%>%
group_by(PIDN)%>%
# drop leading rows until the participant's first non-NA SampleID appears
filter(cumsum(!is.na(SampleID)) >= 1)%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
# nest, per visit, all of that participant's OTHER visits within +/-365 days
group_map(
~ mutate(.x, spec_tps =
lapply(DCDate, function(d)
.x[(which((365 >= (abs(difftime(DCDate, d, units = 'days')))) &
#makes sure current visit isn't included in own nested data
(0 != (abs(difftime(DCDate, d, units = 'days')))))),])),
.keep = TRUE)%>%
bind_rows()%>%
filter((!is.na(SampleID) | map_lgl(pluck(.,"spec_tps"),
function(x) all(is.na(pluck(x,'SampleID'))))))%>%
distinct(PIDN, DCDate, SampleID) ->
pluck(timepoints_dfs, 'original_data_specimens')
#creating list of pidns and dates to generate demographics later on
# Each source list is optional: a failed exists() check makes the `if` return
# NULL, which bind_rows() silently ignores.
all_pidns_dates <- bind_rows(
  if (exists('timepoints_dfs')) bind_rows(map(timepoints_dfs, ~select(.x, PIDN, DCDate))),
  if (exists('original_data_no_tps')) bind_rows(map(original_data_no_tps, ~select(.x, PIDN, DCDate)))
)%>%
  distinct(PIDN, DCDate)%>%
  arrange(PIDN, DCDate)
# 1back, 2back, setshifting
# instrum_list tracks which instruments have already been cleaned with custom
# visit-type levels; the remainder are cleaned with defaults at the end.
custom_instruments <- c("1back", "2back", "setshifting")
instrum_list <- custom_instruments
custom_v_type_levels <- c('spatial cog', 'battery', 'a&c cog', 'tp1 cog')
# modify_at forwards extra arguments to clean_instrument, so no wrapper is needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)
# Medications, subject demographics, and neuroexam
custom_instruments <- c("udsmedicalconditions", "udsmedicationsnonprescrver1", "udsmedicationsprescrver1",
                        "udsmedicationsvitasupver1", "udssubjectdemo", "adrcneuroexam")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', 'ppg neurological exam', 'diagnostic evaluation')
# modify_at forwards extra arguments to clean_instrument, so no wrapper is needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)
# Bedsidescreen
# Clean the bedsidescreen table, blank out negatives, fill gaps upward, and keep
# one row per PIDN/DCDate, recording which instr_ids contributed to the fill.
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', 'ppg neurological exam', 'diagnostic evaluation')
clean_instrument(pluck(lava, 'bedsidescreen'),
v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)%>%
mutate(across(everything(), function(x) {replace(x, which(x < 0), NA)}))%>%
# NOTE(review): fill() and the str_c(collapse = ...) below operate on whatever
# grouping clean_instrument() returns; if the table is ungrouped at this point,
# the upward fill crosses participants and filled_with_instr_ids collapses ALL
# instr_ids into one string on every row -- confirm grouping (likely intended
# per PIDN/DCDate).
fill(everything(), .direction = "up")%>%
mutate(filled_with_instr_ids = str_c(instr_id, collapse = ", "))%>%
relocate(filled_with_instr_ids, .after = instr_id)%>%
distinct(PIDN, DCDate, .keep_all = TRUE) -> pluck(lava, 'bedsidescreen')
#cdr, npi, faq
custom_instruments <- c("cdr", "npi", "faq")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('nursing', 'adrc nursing', '6 month', '12 month', 'informant', 'ppg informant interview', 'diagnostic evaluation')
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)
# uds physical
custom_instruments <- c("udsphysical", "hbudsphysical")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', '12 month', 'hb intermediary cog', 'ppg neurological exam', 'a&c cog', 'tp1 cog')
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)
# Process the remaining instruments with default v_type and dc_type levels
remaining_instruments <- setdiff(names(lava), instrum_list)
lava <- lava%>%
  modify_at(remaining_instruments, clean_instrument, replace_neg_values = TRUE)
# Access the object from the global environment
#all_pidns_dates <- get("all_pidns_dates", envir = globalenv())
#joining uds subject demo to fill in blanks where possible
# coalesce_join (project helper): values from the first table win; the UDS
# subject-demo table only fills in where the demographics table is NA.
coalesce_join(pluck(lava, 'demographics')%>%
ungroup()%>%
select(PIDN, dob, deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple)%>%
# codes < 0, 88, 99 are missing-value codes in these fields
mutate(across(c(hand, educ, primary_language, race_simple, span_or, y_span_or), function(x)
{replace(x, which(x<0 | x == 88 | x == 99), NA)})),
pluck(lava, 'udssubjectdemo')%>%
ungroup()%>%
select(PIDN, DCDate, handed, educ, primlang, primlanx, race, racex, racesec, racesecx, raceter, raceterx, hispanic, hispor, hisporx)%>%
#remove missing values
mutate(across(-PIDN, function(x) {replace(x, which(x < 0 | x == 88 | x == 99), NA)}))%>%
#filter out rows missing data
# keep, per PIDN, the most complete (fewest NAs) and most recent row
group_by(PIDN)%>%
arrange(rowSums(is.na(.)), desc(DCDate))%>%
select(-DCDate)%>%
distinct(PIDN, .keep_all = TRUE)%>%
# translate UDS codes into the demographics table's vocabulary
# NOTE(review): race_simple only maps race codes 1 and 2 here; other races stay NA
mutate(hand = case_when(handed == 1 ~ 'LEFT',
handed == 2 ~ 'RIGHT',
handed == 3 ~ 'AMBIDEXTROUS'),
primary_language = case_when(primlang == 1 ~ 'English',
primlang == 2 ~ 'Spanish',
primlang == 3 ~ 'Mandarin',
primlang == 4 ~ 'Cantonese',
primlang == 5 ~ 'Russian',
primlang == 6 ~ 'Japanese',
primlang == 8 ~ primlanx),
race_simple = case_when(race == 1 & is.na(racesec) ~ 1,
race == 2 & is.na(racesec) ~ 2),
span_or = case_when(hispanic == 0 ~ 2,
hispanic == 1 ~ 1),
y_span_or = case_when(hispor == 3 ~ 5,
hispor == 2 ~ 4,
hispor == 5 ~ 3,
hispor == 6 ~ 2))%>%
select(PIDN, hand, primary_language, race, span_or, y_span_or), by = 'PIDN')%>%
select(PIDN, dob, deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple) -> demographics_df
# recode demographics for the pidns in environment
# Restrict demographics to the PIDNs in the master roster, add text labels for
# the coded ethnicity/race fields, and keep the most complete row per PIDN.
demographics_df%>%
left_join(all_pidns_dates,
by = 'PIDN')%>%
# matches('\\bRFM\\b') selects an RFM column if one exists (no-op otherwise)
select(PIDN, dob, matches('\\bRFM\\b'), deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple)%>%
mutate(span_or_text = recode(span_or, '1' = 'Yes', '2' = 'No'),
y_span_or_text = recode(y_span_or,
'1' = 'North American',
'2' = 'South American',
'3' = 'Central American',
'4' = 'Puerto Rican',
'5' = 'Cuban',
'6' = 'Haitian',
'7' = 'Other Spanish/Hispanic/Latino'),
race_simple_text = recode(race_simple,
'1' = 'White',
'2' = 'Black/AfricanAmerican',
'3' = 'Asian Indian',
'4' = 'Cambodian',
'5' = 'Chinese',
'6' = 'Filipino',
'7' = 'Japanese',
'8' = 'Hmong',
'9' = 'Korean',
'10' = 'Laotion',
'11' = 'Vietnamese',
'12' = 'Other Asian',
'13' = 'Native Hawaiian',
'14' = 'Guamamian',
'15' = 'Somoan',
'16' = 'Other Pacific Islander',
'17' = 'American Indian',
'18' = 'Other Race'))%>%
# one row per PIDN, preferring the row with the fewest NAs
arrange(PIDN, rowSums(is.na(.)))%>%
distinct(PIDN, .keep_all = TRUE) -> demographics_curated_df
# Brain Health Assessment: keep the scored items; -5 is the missing code
brainhealthassessment <- lava$brainhealthassessment%>%
  select(PIDN, DCDate, instr_type, v_type, digitsymbol_corr, digitsymbol_err, favorites_recall1,
         favorites_recall2, favorites_delay, favorites_total, favorites_recog_corr, favorites_recog_sme,
         favorites_recog_err, par_line_score, par_line_catchtrial, lo_score, lo_catchtrial)%>%
  mutate(across(c(digitsymbol_corr:lo_catchtrial), ~ifelse(. %in% c(-5), NA, .)))

# CDR: global and box scores; negative codes mean missing
cdr <- lava$cdr%>%
  select(PIDN, DCDate, instr_type, cdr_tot, box_score)%>%
  mutate(across(c(cdr_tot, box_score), ~replace(., which(. < 0), NA)))

# Clinical labs: values passed through as-is
clinical_labs <- lava$hbclinicallabs_myelinucddra%>%
  select(PIDN, DCDate, instr_type, glucose_mg_d_l, hemoglobin_a1c_percent, insulin_u_u_ml, total_cholesterol_mg_dl,
         triglycerides, hdl_mg_d_l, ldl_mg_d_l, hs_crp_mg_l, cholesterol_hdl_ratio, non_hdl_cholesterol, homa_ir)

# CPT: correct and error totals only
cpt <- lava$cpt%>%
  select(PIDN, DCDate, total_corr, total_errors)
# Diagnosis per visit; -8 is the not-assessed code (na_if applied table-wide)
diagnosis <- lava$diagnosis%>%
  select(PIDN, DCDate, clin_syn_best_est, clin_syn_sec_est, res_dx_a, res_dx_b)%>%
  na_if(-8)

# Latest diagnosis: one row per PIDN, preferring the most complete record
diagnosis_latest <- lava$diagnosis_latest%>%
  ungroup()%>%
  select(PIDN, clin_syn_best_est, clin_syn_sec_est, res_dx_a, res_dx_b)%>%
  na_if(-8)%>%
  arrange(PIDN, rowSums(is.na(.)))%>%
  distinct(PIDN, .keep_all = TRUE)
# Early developmental history: negatives -> NA, drop rows with no edh_* data,
# then recode 9 (unknown) to NA table-wide
earlydevhistory <- lava$earlydevhistory%>%
  select(PIDN, DCDate, instr_type, edh_dys, edh_mot, edh_lan, edh_att, edh_imp, edh_anti, edh_shy,
         edh_dep, edh_anx, edh_obs, edh_oth, edh_read, edh_spell, edh_math, edh_f_lan, edh_mech,
         edh_sport, edh_music, edh_art, edh_hyp)%>%
  mutate(across(where(is.numeric), ~replace(., which(. < 0), NA)))%>%
  filter(rowSums(across(edh_dys:edh_hyp, ~ !is.na(.))) > 0)%>%
  na_if(9)

# FAQ: valid totals are 0-30; anything outside that range becomes NA, and
# incomplete rows are dropped
faq <- lava$faq%>%
  select(PIDN, DCDate, faq_tot)%>%
  mutate(faq_tot = replace(faq_tot, which(faq_tot < 0 | faq_tot > 30), NA))%>%
  drop_na()

# Fisherman story recall; negative codes mean missing
fishermanstory <- lava$fishermanstory%>%
  select(PIDN, DCDate, instr_type, total30min, totalweek)%>%
  mutate(across(c(total30min, totalweek), ~replace(., which(. < 0), NA)))
# Medications: prescription meds (codes -> NA) stacked with the details table
# pivoted wide to one indicator column per drugid.
bind_rows(
lava$udsmedicationsprescrver1%>%
select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
mutate(across(c(pmeds:pmtpfu),
~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .))),
lava$udsmedicationsdetailsver2%>%
# values_fn collapses duplicate PIDN/DCDate/drugid cells into a string
# ("TRUE"/"FALSE"-ish text) that is coerced with as.logical() below.
# NOTE(review): the paste(lst(.x)) branch yields a deparsed list string when
# multiple values collide -- confirm that as.logical() of it is intended.
pivot_wider(id_cols = c('PIDN', 'DCDate','instr_type'),
names_from = 'drugid',
names_glue = "drugid_{drugid}",
values_from = c('not_listed'),
values_fn = ~ifelse(all(is.na(.x)), paste(is.na(.x)), paste(lst(.x)))))%>%
# drugid 99999 is the "unknown drug" bucket
rename(drugid_NA = drugid_99999)%>%
mutate(across(-c(instr_type, drugid_NA), ~as.logical(.)))%>%
# natural-sort the drugid_* columns after the key columns
relocate(PIDN, DCDate, instr_type, gtools::mixedsort(names(.))) ->
medications
# ADRC neuro exam: UPDRS and PSP items; -5 is the missing code (table-wide)
adrcneuroexam <- lava$adrcneuroexam%>%
  select(PIDN, DCDate, instr_type, updrs, psp_oc, psp_limb, psp_gait)%>%
  na_if(-5)
#npi -- joining the two npi instruments
# Full join keeps visits present in either the UDS NPI-Q (prefixed uds_) or the
# long-form NPI; negatives become NA everywhere, and 9 ("unknown") becomes NA in
# every item column except the totals.
full_join(lava$udsnpi%>%
rename_with(~paste('uds',., sep = '_'), -c('PIDN':'instr_id'))%>%
select(PIDN, DCDate, uds_npiqinf, uds_npiqinfx, uds_del, uds_delsev, uds_hall, uds_hallsev,
uds_agit, uds_agitsev, uds_depd, uds_depdsev, uds_anx, uds_anxsev, uds_elat,
uds_elatsev, uds_apa, uds_apasev, uds_disn, uds_disnsev, uds_irr, uds_irrsev,
uds_mot, uds_motsev, uds_nite, uds_nitesev, uds_app, uds_appsev),
lava$npi%>%
select(PIDN, DCDate, delusn, dngr, steal, affair, vistrs, claim,
home, abandon, t_vfigs, del_oth, del_freq, del_sev, del_dis, del_totl,
hlcntns, voices, talk, see, smell, touch, taste, hal_oth, hal_freq,
hal_sev, hal_dis, hal_totl, agitate, resist, stubborn, help, behavior,
curse, throw, hit, ag_oth, ag_freq, ag_sev, ag_dis, ag_totl, dprssn,
tearful, sad, failure, punish, future, burden, death, dep_oth, dep_freq,
dep_sev, dep_dis, dep_totl, anxiety, worry, tense, sighing, nervous,
avoid, upset, anx_oth, anx_freq, anx_sev, anx_dis, anx_totl, euphoria,
happy, humor, giggle, jokes, pranks, truth, eup_oth, eup_freq, eup_sev,
eup_dis, eup_totl, apathy, spntns, convrs, emotion, chores, intrst,
friends, enthuse, apth_oth, apth_freq, apth_sev, apth_dis, apth_totl,
disinhibition, impulsiv, stranger, hurt, crude, openly, hug, dis_oth,
dis_freq, dis_sev, dis_dis, dis_totl, irritble, temper, mood, anger,
coping, cranky, difficult, irr_oth, irr_freq, irr_sev, irr_dis, irr_totl,
motor, pace, rummage, clothing, habits, repetitive, fidget, mot_oth,
mot_freq, mot_sev, mot_dis, mot_totl, sleep, fall_asleep, night, wander,
awaken, start, early, day, sleep_oth, sle_frq, sle_sev, sle_dis, sle_totl,
eat, appetite, increase, wght_loss, wght_gain, change, food, food_type,
eat_oth, eat_freq, eat_sev, eat_dis, eat_totl, total, dstrs_tot, npi_q), by = c('PIDN','DCDate'))%>%
mutate(across(c(uds_npiqinf:npi_q), function(x) {replace(x, which(x < 0), NA)}))%>%
mutate(across(-c(del_totl, hal_totl, ag_totl, dep_totl, anx_totl, dstrs_tot, total, eat_totl, sle_totl, mot_totl, irr_totl, dis_totl, apth_totl, eup_totl),
~na_if(.x, 9)))%>%
arrange(PIDN, DCDate) ->
npi
#uds otc meds
# drop administrative columns, recode the instrument's missing/unknown codes to NA
otc_meds <- lava$udsmedicationsnonprescrver1%>%
  select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
  mutate(across(c(nmeds:nmtfu), ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))

# Pattern separation: lure discrimination index = similar-lure minus similar-foil
patternseparation <- lava$patternseparation%>%
  mutate(LDI = p_sep_sim_lure - p_sep_sim_foil)%>%
  select(PIDN, DCDate, instr_type, p_sep_sim_lure, p_sep_sim_foil, LDI)

#uds vitamins data
# same administrative-column drop and missing-code recode as the other med tables
supplements <- lava$udsmedicationsvitasupver1%>%
  select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
  mutate(across(c(vitasups:vstfu), ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))
# UDS health history: tobacco/alcohol, cardiovascular, stroke/TIA, neurological
# and psychiatric history items; recode the missing/unknown codes to NA
health_history <- lava$udshealthhistory%>%
  select(PIDN, DCDate, instr_type, tobac30, tobac100, smokyrs, packsper, quitsmok, alcoccas, alcfreq, cvhatt, hattmult,
         hattyear, cvafib, cvangio, cvbypass, cvpace, cvpacdef, cvchf, cvangina, cvhvalve, cvothr, cvothrx, cbstroke,
         strok1yr, strok2yr, strok3yr, strok4yr, strok5yr, strok6yr, strokmul, strokyr, cbtia, tia1yr, tia2yr, tia3yr,
         tia4yr, tia5yr, tia6yr, tiamult, tiayear, cbothr, cbothrx, pd, pdyr, pdothr, pdothryr, seizures, traumbrf,
         traumext, traumchr, tbi, tbibrief, tbiexten, tbiwolos, tbiyear, ncothr, ncothrx, hyperten, hypercho, diabetes,
         diabtype, b12def, thyroid, arthrit, arthtype, arthtypx, arthupex, arthloex, arthspin, arthunk, incontu, incontf,
         apnea, rbd, insomn, othsleep, othsleex, alcohol, abusothr, abusx, ptsd, bipolar, schiz, dep2yrs, depothr, anxiety,
         ocd, npsydev, psycdis, psycdisx)%>%
  mutate(across(c(tobac30:psycdisx),
                ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 9, 88, 95, 96, 97, 98, 99, 888, 8888, 995, 996, 997, 998, 999, 9999), NA, .)))
# UDS medical conditions; recode missing/unknown codes to NA
med_conditions <- lava$udsmedicalconditions%>%
  select(PIDN, DCDate, instr_type, cancer:othcondx, form_ver)%>%
  mutate(across(c(cancer:form_ver), ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 8, 9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))

# Physicals: stack the UDS and HB versions, keep vitals, blank out negatives and
# the instrument's code values, then run the standard cleaning pass
physical <- bind_rows(lava$udsphysical, lava$hbudsphysical)%>%
  select(PIDN, DCDate, instr_type, height, weight, bpsys, bpdias, hrate)%>%
  mutate(across(where(is.numeric), ~replace(., which(. < 0), NA)))%>%
  mutate(across(where(is.numeric), ~ifelse(. %in% c(777, 88, 88.8, 888, 99.9, 999), NA, .)))%>%
  clean_instrument()
# Qualtrics exports, saved as a named list of data frames
qualtrics_data <- readRDS(str_c(datafolder,'qualtrics_data_2023_03_14.RData'))
#/Volumes/macdata/projects/hillblom/Datasets/Dataset_Summer_2022qualtrics_data_2022_11_16.RData
# BU subset: -99 is the missing code (table-wide)
bu_subset_data <- qualtrics_data$bu_subset_data%>%
  na_if(-99)
# OSU TBI LOC/PTA subset passes through unchanged
osu_tbi_locpta_subset_data <- qualtrics_data$osu_tbi_locpta_subset_data
# Perceived Stress Scale; prioritize_qualtrics_data (project helper) resolves
# PIDN/DCDate duplicates across sources
pss <- qualtrics_data$qualtrics_hap_perceived_stress_scale%>%
  mutate(instr_type = 'PSS-10')%>%
  select(PIDN, DCDate, source, instr_type, PSSTotal)%>%
  prioritize_qualtrics_data()

# Grit scale: qualtrics plus completed LAVA records
grit <- qualtrics_data$qualtrics_hap_grit_scale%>%
  bind_rows(lava$grit%>%
              filter(dc_status == 'Complete')%>%
              mutate(source = 'lava'))%>%
  mutate(instr_type = 'GRIT')%>%
  select(PIDN, DCDate, instr_type, source, grit_tot, grit_consistence_tot, grit_persev_eff_tot, grit_score, grit_consistence_score, grit_persev_eff_score)%>%
  prioritize_qualtrics_data()
# Edinburgh Handedness: qualtrics plus completed LAVA records; 9 means no
# natural preference and is recoded to NA
edinburgh_handedness <- qualtrics_data$qualtrics_edinburgh_handedness_inventory%>%
  bind_rows(lava$edinburghhandedness%>%
              #replace negative values with NA
              mutate(source = 'lava',
                     across(everything(), ~replace(., which(. < 0), NA)))%>%
              #filter out 'scheduled'
              filter(dc_status == 'Complete'))%>%
  select(PIDN, DCDate, source, instr_type, ehi1, ehi2, ehi3, ehi4, ehi5, ehi6, ehi7, ehi8, ehi9, ehi10, ehi11, ehi12, ehi_tot, natural_l, natural_r)%>%
  mutate(across(c(natural_l, natural_r), ~na_if(.x, 9)))%>%
  mutate(instr_type = 'Edinburgh Handedness')%>%
  prioritize_qualtrics_data()

# MIND diet: the LAVA table needs name/type cleanup before stacking
diet <- qualtrics_data$qualtrics_hap_diet%>%
  bind_rows(lava$minddiet%>%
              filter(dc_status == 'Complete')%>%
              clean_names()%>%
              rename(DCDate = dc_date,
                     PIDN = pidn)%>%
              mutate(PIDN = as.numeric(PIDN),
                     DCDate = as_date(DCDate),
                     source = 'lava'))%>%
  #see if there are any duplicates for PIDN/DCDate combos
  mutate(instr_type = 'MIND Diet')%>%
  select(PIDN, DCDate, instr_type, source, dietq_mindscore)%>%
  prioritize_qualtrics_data()
# GAD-7: qualtrics plus LAVA records, keeping only the total score.
qualtrics_data$qualtrics_gad%>%
bind_rows(lava$gad7%>%
mutate(source = 'lava'))%>%
group_by(PIDN,DCDate)%>%
#all v_type === SOICAL FX PSYCHIATRY, self_other == Self, instr_type == GAD7 so not including those columns
select(PIDN, DCDate, source, gad7_1, gad7_2, gad7_3, gad7_4, gad7_5, gad7_6, gad7_7, gad7_tot)%>%
mutate(across(c(gad7_tot), function(x) {replace(x, which(x < 0), NA)}))%>%
mutate(instr_type = 'GAD-7')%>%
# NOTE(review): unlike the sibling blocks, this final select drops `source`
# before prioritize_qualtrics_data() -- confirm the helper doesn't need it.
# Also note the result stays grouped by PIDN/DCDate.
select(PIDN, DCDate, instr_type, gad7_tot)%>%
prioritize_qualtrics_data() ->
gad
# MAAS mindfulness: tag the instrument and compute the per-participant total.
# BUG FIX: the original used sum(across(...)) inside mutate() WITHOUT
# rowwise(), which sums every row and column into a single grand total that is
# recycled down the whole column. rowSums() gives the intended per-row total
# and, like sum() with na.rm = FALSE, yields NA when any item is missing.
qualtrics_data$qualtrics_hap_maas_mindfulness%>%
mutate(instr_type = 'MAAS',
maas_mindfulnesstot = rowSums(across(maas_notawareemotion:maas_snackwithoutattn)))%>%
select(PIDN, DCDate, instr_type, maas_mindfulnesstot, maas_mindfulnessscore)%>%
prioritize_qualtrics_data() ->
maas_mindfulness
# PASE physical activity: keep the identifying columns and the PASE total,
# then resolve duplicate PIDN/DCDate entries.
pase <- qualtrics_data$qualtrics_hap_physical_activity_scale %>%
  select(PIDN, DCDate, source, pase_pase_total) %>%
  prioritize_qualtrics_data()
# Cognitive Activity Scale (CAS) + Lifetime Cognitive Activity (LCA):
# stack Qualtrics with completed LAVA visits, recode 9 ("never") to 0,
# reverse-score non-zero CAS items (6 - value), and total per row.
# cas_score = truncated average over the items actually endorsed (non-zero).
# BUG FIX: lca_lib_crd was listed twice in the lca_reading sum, double-counting
# the library-card item; it now appears once (matching lca_total, which lists
# it once).
qualtrics_data$qualtrics_hap_cognitive_activity_scale%>%
bind_rows(lava$cognitiveactivity%>%
filter(dc_status == 'Complete')%>%
select(PIDN,DCDate,cas_game6:last_col())%>%
mutate(across(everything(), function(x){replace(x, which(x<0), NA)}),
source = 'lava'))%>%
ungroup()%>%
# rowwise() so the sum()/across() calls below aggregate within each row
rowwise()%>%
mutate(across(c(cas_game6:cas_games_yr),
function(column_value) {ifelse(column_value == 9, 0, column_value)}),
cas_tot_pts = sum(across(
c(cas_game6,
cas_read6,
cas_story6,
cas_lib12,
cas_paper12,
cas_mags12,
cas_read12,
cas_write12,
cas_game12,
cas_lib18,
cas_paper18,
cas_mags18,
cas_read18,
cas_write18,
cas_game18,
cas_paper40,
cas_mags40,
cas_books40,
cas_write40,
cas_games40,
cas_paper_yr,
cas_mags_yr,
cas_books_yr,
cas_write_yr,
cas_games_yr),
function(column_value) { ifelse(column_value != 0, 6 - column_value, column_value)}), na.rm = FALSE),
cas_score = as.integer(cas_tot_pts/sum(across(
c(cas_game6,
cas_read6,
cas_story6,
cas_lib12,
cas_paper12,
cas_mags12,
cas_read12,
cas_write12,
cas_game12,
cas_lib18,
cas_paper18,
cas_mags18,
cas_read18,
cas_write18,
cas_game18,
cas_paper40,
cas_mags40,
cas_books40,
cas_write40,
cas_games40,
cas_paper_yr,
cas_mags_yr,
cas_books_yr,
cas_write_yr,
cas_games_yr),
function(column_value) { column_value != 0 }), na.rm = FALSE)),
lca_reading = sum(across(
c(lca_paper,
lca_mags,
lca_books,
lca_bks_yr,
lca_lib_crd,
lca_bkcase,
lca_bks_cnt)), na.rm = FALSE),
lca_total = sum(across(
c(lca_radio,
lca_ra_news,
lca_tv,
lca_tv_news,
lca_paper,
lca_mags,
lca_books,
lca_bks_yr,
lca_lib_crd,
lca_bkcase,
lca_bks_cnt)), na.rm = FALSE))%>%
select(PIDN, DCDate, source, cas_tot_pts, cas_score, lca_reading, lca_total)%>%
prioritize_qualtrics_data() ->
cogntive_activity_scale
# Everyday Cognition (self-report): stack Qualtrics with LAVA, keep the
# columns present in both sources, prefer the duplicate row with the fewest
# NAs, and clean negative sentinel codes.
qualtrics_data$qualtrics_hap_everyday_cognition_patient_self_report_form%>%
bind_rows(lava$everydaycogself%>%
arrange(PIDN,DCDate)%>%
select(PIDN,DCDate,ec_score:last_col())%>%
mutate(source = 'lava'))%>%
#select only cols that are in all sources
select(PIDN, DCDate, source, ec_concern, ec_mem_score:ec_score)%>%
#arrange by rows with most data if duplicates
arrange(PIDN, DCDate, rowSums(is.na(.)))%>% #replace negative values with NA
mutate(across(everything(), function(x){replace(x, which(x<0), NA)}))%>%
select(PIDN, DCDate, ec_concern, ec_mem_score, ec_lang_score, ec_vis_score, ec_plan_score, ec_org_score, ec_attn_score, ec_other_score, ec_score)%>%
mutate(instr_type = 'Everyday Cog Self')%>%
relocate(instr_type, .after = DCDate)%>%
prioritize_qualtrics_data() -> everydaycogself
# Sex & reproductive health: blank out internally inconsistent responses
# (age at last period earlier than age at first child / ovary removal; an
# implausible second-trimester pregnancy count).
# BUG FIX: the comparison columns are frequently NA (e.g. participants with no
# children), and an NA comparison made the whole ifelse() condition NA, which
# silently set valid sexQ_ageatlastperiod values to NA. Each comparison is now
# guarded with !is.na() so only demonstrably inconsistent ages are removed.
qualtrics_data$qualtrics_sex_and_reproductive_health_questions%>%
#add check to replace invalid responses with NA
mutate(sexQ_ageatlastperiod = ifelse((!is.na(sexQ_children_ageatfirstchild) & sexQ_ageatlastperiod < sexQ_children_ageatfirstchild) |
(!is.na(sexQ_ovaryremoval_age) & sexQ_ageatlastperiod < sexQ_ovaryremoval_age), NA, sexQ_ageatlastperiod),
sexQ_pregnancies_secondtrimester = ifelse(sexQ_pregnancies_secondtrimester > 40, NA, sexQ_pregnancies_secondtrimester))%>%
prioritize_qualtrics_data() ->
sex_and_reproductive_health
# CHAMPS activity questionnaire: Qualtrics only (LAVA rows lack a data
# dictionary for the MET items, so they are excluded).
qualtrics_data$qualtrics_hap_champs%>%
#filter out lava data since it doesn't match--don't have dict for MET items
filter(source != 'lava')%>%
select(PIDN, DCDate, version, DCDate_physical, weight, date_diff_physical, TotalNum_Soc,
TotalNum_Cog, TotalNum_Phys, TotalNum_ALL, WklyHrs_Soc, WklyHrs_Cog,
WklyHrs_Phys, WklyHrs_ALL, New_Soc, New_Phys, New_Cog, New_ALL, WklyCount_Soc, WklyCount_Phys,
WklyCount_Cog, WklyCount_ALL, METWeighted_Q13, METWeighted_Q15, METWeighted_Q20, METWeighted_Q21,
METWeighted_Q24, METWeighted_Q25, METWeighted_Q26, METWeighted_Q27, METWeighted_Q28, METWeighted_Q29,
METWeighted_Q30, METWeighted_Q31, METWeighted_Q32, METWeighted_Q33, METWeighted_Q34, METWeighted_Q35,
METWeighted_Q36, METWeighted_Q37, METWeighted_Q38, METWeightcompleted, TotalCaloricExpenditure)%>%
prioritize_qualtrics_data() ->
champs
# Deduplicate every remaining table in the raw Qualtrics list in place.
# NOTE(review): this overwrites qualtrics_data AFTER the per-instrument frames
# above were extracted from it; keep this line last if the section is reordered.
qualtrics_data <- map(qualtrics_data, ~ prioritize_qualtrics_data(.x))
# Remote (virtual) bedside battery exported from Qualtrics.
# col_select = 18:96 skips the export's leading metadata columns; skip = 1
# drops the second header row -- TODO confirm against the raw CSV layout.
virtual_bedside <- read_csv(str_c(datafolder,'HB Remote Bedside Data/HAP+Virtual+Bedside_June+10,+2022_17.10.csv'), col_select = 18:96, skip = 1)
#virtual bedside
# Rename the 79 retained columns to MAC variable names, recode negative
# sentinel values to NA, parse the visit date, and drop QC bookkeeping columns.
# NOTE(review): 'repeat' is a reserved word in R; downstream references to that
# column will need backticks.
virtual_bedside%>%
set_names(lst('PIDN','CenterID','DCDate','MOCACUBE','MOCACLOCK','MOCALANGUAGE','CV2T1C','CV2T2C','CV2T3C','CV2T4C','CV2T5C',
'CV2TbC','CV2SFRC','CV2SDCC','CV2LFRC','CV2LDCC','CV2T1I','CV2T2I','CV2T3I','CV2T4I','CV2T5I','CV2TbI','CV2SFRI',
'CV2SDCI','CV2LFRI','CV2LDCI','CV2Hit','CV2bR','CV2bU','CV2NP','CV2NU','Rey10m','ReyRecg','WRATWrd','WRATLet',
'WRATIr','Syntax','Verbal','repeat','repeat5','minttots','minttotw','mintscng','mintscnc','mintpcng','mintpcnc',
'LngPVrb','LngPDes','LngPAni','LngPIna','modRey','numbLoc','Calc','CATSFMTot','CATSAMTot','StrpCNCor','StrpCNErr',
'StrpCor','StrpErr','StrpSCE','Dcorr','Dreps','DRuleV','behav1','behav2','behav3','behav4','behav5','behav6',
'behav7','behav8','behav9','behav10','researchStatus','qualityIssueLogical','qualityIssue1','qualityIssue2',
'qualityIssue3','qualityNotes'))%>%
mutate(across(where(is.numeric), function(x) {replace(x, which(x < 0), NA)}))%>%
mutate(DCDate = as_date(DCDate, format = '%m/%d/%Y'))%>%
select(-c(CenterID, qualityIssue1, qualityIssue2, qualityIssue3, qualityIssueLogical, qualityNotes)) ->
virtual_bedside
# join synced and unsynced tabcat data and seperate into list of dfs since multiple dates for a data collection period
tabcat_unsynced <- read_excel(str_c(datafolder,'TabCAT Data/EXAMINER Unsynced Data.xlsx'), guess_max = 1048576)
# Synced export: read everything as text and coerce later (mixed-type cells).
tabcat<- read_excel(str_c(datafolder,"TabCAT Data/MACResearch_TabCAT_2023_01_27.xlsx"), sheet = 'combo', col_types = 'text')
# join synced and unsynced tabcat data and seperate into list of dfs since multiple dates for a data collection period
# Normalize unsynced column names to match the synced export's naming scheme.
tabcat_unsynced%>%
rename(DCDate = `Visit Date`,
Flanker_TotalScore = "Flanker Total Score",
RapidNaming_AvgReactionTime = "Rapid Naming Average Time",
RapidNaming_TotalSkipped = "Rapid Naming Total Skipped",
RapidNaming_TotalIncorrect = "Rapid Naming Total Incorrect",
RapidNaming_TotalCorrect = "Rapid Naming Total Correct",
RunningDots_5Dot_PercentCorrect = "Running Dots 5 Dot % Correct",
RunningDots_4Dot_PercentCorrect = "Running Dots 4 Dot % Correct",
RunningDots_3Dot_PercentCorrect = "Running Dots 3 Dot % Correct",
RunningDots_2Dot_PercentCorrect = "Running Dots 2 Dot % Correct",
SetShifting_TotalScore = "Set Shifting Total Score",
DotCounting_TotalScore = "Dot Counting Total Score")%>%
mutate(DCDate = as_date(DCDate),
Flanker_TotalScore = as.numeric(Flanker_TotalScore))->
tabcat_unsynced
# Synced export cleanup: Excel serial dates use the Windows origin 1899-12-30;
# textual NA spellings become real NA; reaction time strings like "1,234ms"
# are parsed to numbers; type_convert() then re-infers column types.
tabcat%>%
mutate(across(contains('date'), ~ as.Date(as.numeric(.x), origin = "1899-12-30")))%>%
rename(PIDN = Examinee_Identifier, DCDate = Encounter_StartDate)%>%
mutate(across(where(is.character), function(x) {replace(x, which(x == 'NA' | x == 'N/A' | x == 'NaN' | x == 'null'), NA)}),
RapidNaming_AvgReactionTime = parse_number(str_remove(str_replace(RapidNaming_AvgReactionTime, ',', '.'), 'ms')))%>%
type_convert(trim_ws = TRUE)%>%
bind_rows(tabcat_unsynced) ->
tabcat
#getting names of different instruments to extract into their own dfs and name the list item with the instrument
# Instrument prefixes are the text before the first underscore; '_' is appended
# so contains() matches the full prefix, and 'Composite_' is remapped to 'BHA'.
tabcat%>%
select(!PIDN & !DCDate & !contains("Encounter") & !contains("Informant") & !contains('Examinee') & !contains('fav'))%>%
colnames()%>%
str_extract("[^_]+")%>%
unique()%>%
str_c('_')%>%
str_replace('Composite_', 'BHA')%>%
append('fav') ->
tabcat_instruments
# set the extracted dfs in a list of dfs with the name of the insturment the name of the df
# Keep only rows with at least one non-NA value for that instrument.
map(tabcat_instruments,
~ select(tabcat, c(PIDN, DCDate, contains(.x)))%>%
filter(!if_all(-c(PIDN, DCDate), is.na)))%>%
set_names(str_c('tabcat.', tabcat_instruments)%>%
make_clean_names()) ->
tabcat
# Pull individual instrument frames out of the list.
tabcat$tabcat_bha -> tabcat_bha
tabcat$tabcat_animal_fluency -> tabcat_animal_fluency
#consolidating dot counting data
# Full join LAVA and TabCAT dot-counting totals side by side per PIDN/DCDate.
full_join(lava$dotcounting%>%
filter(!is.na(dot_counting_total) & dot_counting_total >= 0)%>%
select(PIDN, DCDate, dot_counting_total)%>%
rename(dot_counting_lava = dot_counting_total),
tabcat$tabcat_dot_counting%>%
filter(!is.na(DotCounting_TotalScore))%>%
rename(dot_counting_total_tabcat = DotCounting_TotalScore), by = c('PIDN', 'DCDate'))%>%
relocate(dot_counting_lava, dot_counting_total_tabcat, .after = DCDate) ->
dot_counting
# Flanker: coerce score columns to numeric ('null'/'N/A' -> NA) and convert
# the 0/1-coded practice-trial flag to logical.
tabcat$tabcat_flanker%>%
mutate(across(c(Flanker_TaskDuration, Flanker_PTSet1_TotalCorrect, Flanker_PTSet2_TotalCorrect, Flanker_TotalScore,
Flanker_Correct_Total, Flanker_Correct_MedianRT, Flanker_Correct_StDevRT, Flanker_CongrCorrect_Total, Flanker_CongrCorrect_MedianRT,
Flanker_IncongrCorrect_Total, Flanker_IncongrCorrect_MedianRT, Flanker_TotalScore_Z),
~ifelse(. == 'null'| . == 'N/A', NA, as.numeric(as.character(.)))),
Flanker_PracticeTrialSuccess = as.logical(case_when(Flanker_PracticeTrialSuccess == '1' ~ 'TRUE',
Flanker_PracticeTrialSuccess == '0' ~ 'FALSE',
TRUE ~ as.character(Flanker_PracticeTrialSuccess)))) ->
tabcat_flanker
tabcat$tabcat_ll -> tabcat_ll
tabcat$tabcat_lo -> tabcat_lo
# Match / Rapid Naming / FAV: drop per-trial item columns, keep summary scores.
tabcat$tabcat_match%>%
select(-c(Match_Correct_0to15:Match_Incorrect_105to120)) -> tabcat_match
tabcat$tabcat_rapid_naming%>%
select(-c(RapidNaming_T1_Score:RapidNaming_T60_VerbatimResponse)) -> tabcat_rapid_naming
tabcat$tabcat_running_dots ->
tabcat_running_dots
tabcat$tabcat_set_shifting -> tabcat_set_shifting
tabcat$tabcat_fav%>%
select(-c(FavDelay_TaskVersion: FavDelay_animal4Score, FavRec_T1:FavRec_T24)) -> tabcat_fav
#Torie put together
imaging_mci <- read_excel(str_c(datafolder,'MRI Datasets/MCI_dataset_03-2022.xlsx'), .name_repair = 'minimal')
#Corrina's imaging dataset
imaging_file <- c(str_c(datafolder,'/MRI Datasets/MRI_Dataset_Summer2020_updated.xlsm'))
old_imaging <- read_all_files(imaging_file, prefix = 'imaging', 0, clean_names = TRUE)
#Data uploaded by Yann to box folder
#lists files in the MRI Datasets/box directory
mri_box_files = list.files(str_c(datafolder,'MRI Datasets/box'), include.dirs = TRUE, recursive = TRUE, full.names = TRUE)
mri_box_data <- read_all_files(mri_box_files, prefix = 'imaging', clean_names = FALSE)
# for wmh, separate out prisma and trio into two columns-also separated out sfva scanned wmh values into their own column. prioritizing older data, new data, then torie/mci data
# for pasl, no changes needed since no overlapping data between old and new
# for pcasl, only need to create new vars. prioritizing older data
# for dti v2 and v6, only including old data until new data reprocessed
# for t1, combine old and new. need guidance on vars that don't overlap. prioritizing older data. still some questions about larger differences e.g., brain_stem
# hold off on functional connectivity for now
#Torie put together
# MCI dataset: drop duplicated column names, keep PIDN plus the scan columns,
# and standardize the date column to DCDate.
imaging_mci%>%
subset(select = which(!duplicated(names(.))))%>%
select(c(PIDN, ScannerID:last_col()))%>%
rename(DCDate = Date)%>%
relocate(DCDate, .after = PIDN)%>%
mutate(DCDate = as_date(DCDate)) ->
imaging_mci
#Data from R Drive
# Workbook sheets: strip the '_mri...' suffix from sheet names, standardize
# mri_date -> DCDate; sheets without mri_date map to NULL and are dropped by
# keep() (NULL has no names, so the DCDate check is FALSE).
old_imaging%>%
set_names(names(.)%>%
str_remove_all('_mri.*'))%>%
map(.,function(x){
if(any(names(x) == 'mri_date')){
x%>%
rename(DCDate = mri_date)%>%
mutate(DCDate = as_date(DCDate))}})%>%
keep(~'DCDate' %in% names(.x)) ->
old_imaging
# NOTE(review): positional renaming -- this assumes the surviving sheets come
# out of keep() in exactly this order; verify against the workbook.
names(old_imaging) <- c("imaging_t1", "imaging_wmh", "imaging_dti_v2", "imaging_dti_v6", "imaging_pcasl", "imaging_pasl", "imaging_fc_trio", "imaging_fc_prisma")
# Standardize each box-file table's date column (any column named exactly
# 'date') to DCDate.
mri_box_data%>%
map(~rename(.x, DCDate = matches('\\bdate\\b'))) ->
mri_box_data
#redoing this to have all data modified
mri_box_data_modified <- mri_box_data
# Grey matter (all production): merge the two export files; the second file's
# PIDN arrives as text and must be coerced before stacking.
mri_box_data_modified$imaging_gm_all_production <- NULL
mri_box_data_modified$imaging_gm_all_production_2 <- NULL
mri_box_data_modified$imaging_gm_all_production <-
bind_rows(mri_box_data$imaging_gm_all_production,
mri_box_data$imaging_gm_all_production_2%>%
mutate(PIDN = as.numeric(PIDN)))%>%
arrange(PIDN)#%>%
#filter(Label == 'Sum')
#schaefer grey matter
# Schaefer parcellation: same merge; keep only the 'Sum' rows per region.
mri_box_data_modified$imaging_gm_schaefer <- NULL
mri_box_data_modified$imaging_gm_schaefer_2 <- NULL
mri_box_data_modified$imaging_gm_schaefer <-
bind_rows(mri_box_data$imaging_gm_schaefer,
mri_box_data$imaging_gm_schaefer_2%>%
mutate(PIDN = as.numeric(PIDN)))%>%
arrange(PIDN)%>%
filter(Label == 'Sum')
#qsm- not sure which to take from this. think average since dti..?
mri_box_data_modified$imaging_qsm_template <- NULL
mri_box_data_modified$imaging_qsm_template_2 <- NULL
mri_box_data_modified$imaging_qsm_template <-
bind_rows(mri_box_data$imaging_qsm_template,
mri_box_data$imaging_qsm_template_2%>%
mutate(PIDN = as.numeric(PIDN)))%>%
arrange(PIDN)%>%
filter(Label == 'Mean')
# Snake_case copy of the merged box data used by the sections below.
mri_box_data_modified_clean_names <- mri_box_data_modified%>%map(~clean_names(.x))
# White-matter hyperintensity (WMH) consolidation: stack four sources, split
# WMH volume into per-scanner columns (trio / prisma / SFVA), null out values
# failing QC, then keep one row per PIDN/DCDate with source priority
# workbook > new prisma > new trio > MCI dataset.
bind_rows(
#yann said to take the wmh load from this?
mri_box_data_modified_clean_names$imaging_summary_trio%>%
mutate(source = 'new_trio_summary',
scanner = 'trio')%>%
rename(wmh_mm3_trio = wmh_mm3),
#yann said to take the wmh load from this?
mri_box_data_modified_clean_names$imaging_summary_prisma%>%
mutate(source = 'new_prisma_summary',
scanner = 'prisma')%>%
rename(wmh_mm3_prisma = wmh_mm3,
wmh_qc = qc_wmh,
flair_qc = qc_flair),
#yann said to take the wmh load from this?
# NOTE(review): imaging_summary is also labeled 'new_prisma_summary' /
# 'prisma' -- confirm that generic summary really is prisma-only.
mri_box_data_modified_clean_names$imaging_summary%>%
mutate(source = 'new_prisma_summary',
scanner = 'prisma')%>%
rename(wmh_mm3_prisma = wmh_mm3),
# Old workbook: scanner is inferred from the pipeline-name suffix.
old_imaging$imaging_wmh%>%
mutate(source = 'workbook_wmh',
scanner = str_extract(wm_hpipeline, "(?<=_)[^_]+$"))%>%
mutate(wmh_mm3_prisma = ifelse(scanner == 'prisma', wmh_mm3, NA),
wmh_mm3_trio = ifelse(scanner == 'trio', wmh_mm3, NA),
scanner = ifelse(scanner == 'supervised' & scanner_id == 'SFVA 4T MRI', 'SFVA 4T MRI', scanner),
wmh_mm3_SFVA = ifelse(scanner_id == 'SFVA 4T MRI', wmh_mm3, NA))%>%
clean_names(),
imaging_mci%>%
mutate(source = 'mci_dataset')%>%
rename(csf_mm3 = `CSV(mm3)`,
wmh_mm3 = `WMH(mm3)`,
scanner = Scanner)%>%
mutate(scanner = tolower(scanner),
wmh_mm3_prisma = ifelse(scanner == 'prisma', wmh_mm3, NA),
wmh_mm3_trio = ifelse(scanner == 'trio', wmh_mm3, NA))%>%
clean_names())%>%
mutate(scanner = tolower(scanner),
#setting data to NA if data didn't pass quality control
across(c(gm_mm3:icv_mm3),
~ifelse((!is.na(wmh_qc) & wmh_qc == 0) |
(!is.na(flair_qc) & flair_qc == 0) |
(!is.na(qc) & qc == 0) |
(!is.na(proc_status_wmh) & proc_status_wmh == 0), NA, .)))%>%
mutate(log_wmh = log(wmh_mm3))%>%
# source-priority ordering so distinct() keeps the preferred row
arrange(pidn, dc_date, factor(source, levels = c('workbook_wmh', 'new_prisma_summary', 'new_trio_summary', 'mci_dataset')))%>%
rename(PIDN = pidn, DCDate = dc_date, scanner_inferred = scanner)%>%
relocate(source, scanner_inferred, .after = DCDate)%>%
select(PIDN, DCDate, source, source_id, wmh_mm3_trio, wmh_mm3_prisma, wmh_mm3_sfva)%>%
filter(!is.na(wmh_mm3_trio) | !is.na(wmh_mm3_prisma) | !is.na(wmh_mm3_sfva))%>%
distinct(PIDN, DCDate, .keep_all = TRUE) ->
imaging_wmh
#pasl joining
#new data does not have lh_ADROI.paslxsec rh_ADROI.paslxsec ADROI_bilat.paslxsec or globalCBF_nocerebellum.paslxsec
#old data doesn't have delta_t" "tiv" "gm" "wm" "label"
# PASL: stack old workbook with the new box export ('Mean' rows only),
# prefer the workbook row on PIDN/DCDate collisions, then drop columns not
# present in both sources.
bind_rows(old_imaging$imaging_pasl%>%
clean_names()%>%
rename_with(~str_remove_all(.x, '_paslxsec'))%>%
mutate(source = 'workbook_pasl')%>%
rename(DCDate = dc_date),
mri_box_data_modified_clean_names$imaging_p_asl_cbf_pvc_gm_all_production%>%
rename(DCDate = dc_date)%>%
filter(label == 'Mean')%>%
mutate(source = 'new_pasl'))%>%
rename(PIDN = pidn)%>%
arrange(PIDN, DCDate, factor(source, levels = c('workbook_pasl', 'new_pasl')))%>%
relocate(source, .after = DCDate)%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
#removing cols that aren't in both
select(-c(lh_adroi:label), -c(scanner_id:as_lpipeline), -global_cbf_nocerebellum)->
imaging_pasl
# pCASL: same pattern across three sources, workbook preferred.
bind_rows(mri_box_data_modified_clean_names$imaging_pc_asl_cbf_pvc_gm_all_production%>%
filter(label == 'Mean')%>%
mutate(source = 'new_pcasl')%>%
rename(DCDate = dc_date,
PIDN = pidn),
mri_box_data_modified_clean_names$imaging_cbf_pvc_gm%>%
rename(PIDN = pidn, DCDate = dc_date)%>%
mutate(source = 'new_pcasl')%>%
filter(label == 'Mean'),
old_imaging$imaging_pcasl%>%
clean_names()%>%
rename_with(~str_remove_all(.x, '_pcasl'))%>%
mutate(source = 'workbook_pcasl')%>%
rename(DCDate = dc_date,
PIDN = pidn))%>%
arrange(PIDN, DCDate, factor(source, levels = c('workbook_pcasl', 'new_pcasl')))%>%
relocate(source, .after = DCDate)%>%
select(-c(delta_t, tiv, gm, wm, label))%>%
# metadata first, region columns alphabetized
relocate(PIDN, DCDate, source, scanner_id, source_id, as_lpipeline, proc_status, global_cbf_nocerebellum, sort(colnames(.)))%>%
rename(asl_pipeline = as_lpipeline)%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
select(-c(scanner_id:asl_pipeline), -global_cbf_nocerebellum) ->
imaging_pcasl
# DTI v2: old workbook only (new data excluded until reprocessed -- see
# section notes above). Coerce tract metrics to numeric, harmonize the fornix
# tract name, and keep one row per PIDN/DCDate.
old_imaging$imaging_dti_v2%>%
rename(date = DCDate)%>%
clean_names()%>%
mutate(across(-c(pidn, date, scanner_id, source_id, dt_ipipeline, proc_status_dti2, dtiv2_v2_v2_v1_v1_converted), ~as.numeric(.x)))%>%
rename_with(~str_replace_all(., "fornix_stria_terminalis", "fornix"))%>%
mutate(source = 'old_workbook')%>%
arrange(pidn, date)%>%
relocate(source, .after = date)%>%
rename(PIDN = pidn, DCDate = date)%>%
distinct(PIDN, DCDate, .keep_all = TRUE)->
dti_v2
# DTI v6: same treatment as v2.
# NOTE(review): unlike v2, there is no arrange() before distinct(), so which
# duplicate survives depends on the sheet's row order -- confirm intended.
old_imaging$imaging_dti_v6%>%
rename(date = DCDate)%>%
clean_names()%>%
mutate(across(-c(pidn, date, scanner_id, source_id, dt_ipipeline, proc_status_dti6), ~as.numeric(.x)))%>%
rename_with(~str_replace_all(., "fornix_stria_terminalis", "fornix"))%>%
mutate(source = 'old_workbook')%>%
rename(PIDN = pidn, DCDate = date)%>%
distinct(PIDN, DCDate, .keep_all = TRUE)->
dti_v6
#t1 data can be combined. Need guidence on what to do with unmatched cols
# New GM export: keep 'Sum' rows and rescale volumes -- /1e6 for tiv/gm/wm and
# /1e3 for regional volumes, presumably mm3 -> L and mm3 -> cm3 to match the
# old workbook's units (TODO confirm units with imaging team). The 'ctx_'
# prefix is stripped so cortical region names line up with the workbook.
new_imaging_gm_all_production_cleaned_renamed <- mri_box_data_modified_clean_names$imaging_gm_all_production%>%
filter(label == 'Sum')%>%
mutate(source = 'new_gm_all',
across(c(tiv, gm, wm), ~ .x / 1000000),
across(left_lateral_ventricle:right_unsegmented_white_matter, ~.x / 1000))%>%
rename(PIDN = pidn, DCDate = dc_date)%>%
rename_with(~str_remove(., "^ctx_"))
# Old workbook T1: coerce measures to double and align wm/gm column names.
old_imaging_t1_formatted_renamed <- old_imaging$imaging_t1%>%
mutate(source = 'old_workbook')%>%
mutate(across(-c(PIDN:t1pipeline, source), ~ as.double(.x)))%>%
rename(wm = wmv,
gm = gmv)
# Stack both T1 sources.
# NOTE(review): distinct() with the inline factor(source, ...) expression adds
# a computed column and dedupes per PIDN/DCDate/source rather than preferring
# one source -- verify this is the intended prioritization.
bind_rows(new_imaging_gm_all_production_cleaned_renamed,
old_imaging_t1_formatted_renamed)%>%
relocate(source, .after = DCDate)%>%
arrange(PIDN, DCDate)%>%
select(-label, -delt_t, -delta_t)%>%
distinct(PIDN, DCDate, factor(source, levels = c('old_workbook', 'new_gm_all')), .keep_all = TRUE)-> t1s_combined
# Mesoscale (MSD) inflammation panel: read the combined CSV, parse the
# specimen date as DCDate, and move the key columns to the front.
mesoscale <- read_csv(str_c(datafolder,"MSD Mesoscale Updated Dataset 2019/allMesoCombined_4_20.csv"))
mesoscale <- mesoscale %>%
  rename(DCDate = meso_specDate) %>%
  mutate(DCDate = as_date(DCDate, format = '%m/%d/%Y')) %>%
  relocate(c(PIDN, DCDate), .before = mesoID)
# Plasma inflammation markers: stack LAVA MSD data with the mesoscale
# workbook, keep the duplicate row with the fewest NAs per PIDN/DCDate, then
# derive *_clncv ("clean CV") copies of each analyte, blanked when the assay's
# coefficient of variation exceeds 20% (an NA CV leaves the value intact).
# BUG FIX: cyt_il_6_clncv previously checked !is.na(cyt_il_4_cv) -- a
# copy-paste error -- and now checks cyt_il_6_cv like every other analyte.
# Also removed the duplicated ang_vegf_cv / ang_vegf_clncv entries from the
# final select() (select() deduplicates, so output is unchanged).
bind_rows(lava$hbmsdinflammation%>%
select(-instr_type, -v_type, -dc_status, -age_at_dc,-instr_id)%>%
# negative codes are missing-value sentinels
mutate(across(everything(), function(x) {replace(x, which(x < 0), NA)})),
mesoscale%>%
rename_with(~str_remove(.x, "\\.x$"))%>%
select(-mesoID,-mesoBatch, -mesoBatchDate, -SITE, -ID))%>%
# prefer the duplicate row with the most data
arrange(PIDN,DCDate, rowSums(is.na(.)))%>%
distinct(PIDN,DCDate,.keep_all = TRUE)%>%
mutate(chem_eotaxin_clncv = ifelse(chem_eotaxin_cv > 0.2 & !is.na(chem_eotaxin_cv), NA, chem_eotaxin),
chem_eotaxin_3_clncv= ifelse(chem_eotaxin_3_cv > 0.2 & !is.na(chem_eotaxin_3_cv), NA, chem_eotaxin_3),
cyt_ifn_gamma_clncv = ifelse(cyt_ifn_gamma_cv > 0.2 & !is.na(cyt_ifn_gamma_cv), NA, cyt_ifn_gamma),
cyt_il_10_clncv = ifelse(cyt_il_10_cv > 0.2 & !is.na(cyt_il_10_cv), NA, cyt_il_10),
cyt_il_12p70_clncv = ifelse(cyt_il_12p70_cv > 0.2 & !is.na(cyt_il_12p70_cv), NA, cyt_il_12p70),
cyt_il_13_clncv= ifelse(cyt_il_13_cv > 0.2 & !is.na(cyt_il_13_cv), NA, cyt_il_13),
cyt_il_1beta_clncv = ifelse(cyt_il_1beta_cv > 0.2 & !is.na(cyt_il_1beta_cv), NA, cyt_il_1beta),
cyt_il_2_clncv= ifelse(cyt_il_2_cv > 0.2 & !is.na(cyt_il_2_cv), NA, cyt_il_2),
cyt_il_4_clncv = ifelse(cyt_il_4_cv > 0.2 & !is.na(cyt_il_4_cv), NA, cyt_il_4),
cyt_il_6_clncv = ifelse(cyt_il_6_cv > 0.2 & !is.na(cyt_il_6_cv), NA, cyt_il_6),
cyt_il_8_clncv = ifelse(cyt_il_8_cv > 0.2 & !is.na(cyt_il_8_cv), NA, cyt_il_8),
il_8_p_clncv = ifelse(il_8_p_cv > 0.2 & !is.na(il_8_p_cv), NA, il_8_p),
chem_ip_10_clncv = ifelse(chem_ip_10_cv > 0.2 & !is.na(chem_ip_10_cv), NA,chem_ip_10),
chem_mcp_1_clncv = ifelse(chem_mcp_1_cv > 0.2 & !is.na(chem_mcp_1_cv), NA, chem_mcp_1),
chem_mcp_4_clncv = ifelse(chem_mcp_4_cv > 0.2 & !is.na(chem_mcp_4_cv), NA, chem_mcp_4),
chem_mdc_clncv = ifelse(chem_mdc_cv > 0.2 & !is.na(chem_mdc_cv), NA, chem_mdc),
chem_mip_1alpha_clncv = ifelse(chem_mip_1alpha_cv > 0.2 & !is.na(chem_mip_1alpha_cv), NA, chem_mip_1alpha),
chem_mip_1beta_clncv = ifelse(chem_mip_1beta_cv > 0.2 & !is.na(chem_mip_1beta_cv), NA, chem_mip_1beta),
chem_tarc_clncv = ifelse(chem_tarc_cv > 0.2 & !is.na(chem_tarc_cv), NA, chem_tarc),
cyt_tnf_alph_clncv = ifelse(cyt_tnf_alph_cv > 0.2 & !is.na(cyt_tnf_alph_cv), NA, cyt_tnf_alph),
ang_bfgf_clncv = ifelse(ang_bfgf_cv > 0.2 & !is.na(ang_bfgf_cv), NA, ang_bfgf),
ang_flt_1_clncv = ifelse(ang_flt_1_cv > 0.2 & !is.na(ang_flt_1_cv), NA, ang_flt_1),
ang_plgf_clncv = ifelse(ang_plgf_cv > 0.2 & !is.na(ang_plgf_cv), NA, ang_plgf),
ang_tie_2_clncv = ifelse(ang_tie_2_cv > 0.2 & !is.na(ang_tie_2_cv), NA, ang_tie_2),
ang_vegf_clncv = ifelse(ang_vegf_cv > 0.2 & !is.na(ang_vegf_cv), NA, ang_vegf),
ang_vegf_c_clncv = ifelse(ang_vegf_c_cv > 0.2 & !is.na(ang_vegf_c_cv), NA, ang_vegf_c),
ang_vegf_d_clncv = ifelse(ang_vegf_d_cv > 0.2 & !is.na(ang_vegf_d_cv), NA, ang_vegf_d),
vasc_crp_clncv = ifelse(vasc_crp_cv > 0.2 & !is.na(vasc_crp_cv), NA, vasc_crp),
vasc_icam_1_clncv = ifelse(vasc_icam_1_cv > 0.2 & !is.na(vasc_icam_1_cv), NA, vasc_icam_1),
vasc_saa_clncv = ifelse(vasc_saa_cv > 0.2 & !is.na(vasc_saa_cv), NA, vasc_saa),
vasc_vcam_1_clncv = ifelse(vasc_vcam_1_cv > 0.2 & !is.na(vasc_vcam_1_cv), NA, vasc_vcam_1))%>%
select(PIDN, DCDate, ang_bfgf, ang_bfgf_cv, ang_bfgf_clncv, ang_flt_1, ang_flt_1_cv, ang_flt_1_clncv,
ang_plgf, ang_plgf_cv, ang_plgf_clncv, ang_tie_2, ang_tie_2_cv, ang_tie_2_clncv,
ang_vegf, ang_vegf_cv, ang_vegf_clncv, ang_vegf_c, ang_vegf_c_cv, ang_vegf_c_clncv,
ang_vegf_d, ang_vegf_d_cv, ang_vegf_d_clncv, chem_eotaxin, chem_eotaxin_cv, chem_eotaxin_clncv,
chem_eotaxin_3, chem_eotaxin_3_cv, chem_eotaxin_3_clncv, chem_ip_10, chem_ip_10_cv, chem_ip_10_clncv,
chem_mcp_1, chem_mcp_1_cv, chem_mcp_1_clncv, chem_mcp_4, chem_mcp_4_cv, chem_mcp_4_clncv,
chem_mdc, chem_mdc_cv, chem_mdc_clncv, chem_mip_1alpha, chem_mip_1alpha_cv, chem_mip_1alpha_clncv,
chem_mip_1beta, chem_mip_1beta_cv, chem_mip_1beta_clncv, chem_tarc, chem_tarc_cv, chem_tarc_clncv,
cyt_ifn_gamma, cyt_ifn_gamma_cv, cyt_ifn_gamma_clncv, cyt_il_10, cyt_il_10_cv, cyt_il_10_clncv,
cyt_il_12p70, cyt_il_12p70_cv, cyt_il_12p70_clncv, cyt_il_13, cyt_il_13_cv, cyt_il_13_clncv, cyt_il_1beta,
cyt_il_1beta_cv, cyt_il_1beta_clncv, cyt_il_2, cyt_il_2_cv, cyt_il_2_clncv, cyt_il_4, cyt_il_4_cv,
cyt_il_4_clncv, cyt_il_6, cyt_il_6_cv, cyt_il_6_clncv, cyt_il_8, cyt_il_8_cv,
cyt_il_8_clncv, cyt_tnf_alph, cyt_tnf_alph_cv, cyt_tnf_alph_clncv, il_8_p,
il_8_p_cv, il_8_p_clncv, mesoPlate, pro_gm_csf, pro_il_1,
pro_il_12p40, pro_il_15, pro_il_16, pro_il_17, pro_il_5, pro_il_7,
pro_tnf, pro_vegf, vasc_crp, vasc_crp_cv, vasc_crp_clncv, vasc_icam_1,
vasc_icam_1_cv, vasc_icam_1_clncv, vasc_saa, vasc_saa_cv, vasc_saa_clncv, vasc_vcam_1,
vasc_vcam_1_cv, vasc_vcam_1_clncv) ->
mesoscale
# Quanterix Simoa plasma biomarkers: main export plus the fitbit-study pTau181
# export.
quanterix <- read_excel(str_c(datafolder,"Quanterix/plasma quanterix data_021622.xlsx"))
quanterix_fitbit <- read_excel(str_c(datafolder,'Quanterix/Casaletto_Plasma_Simoa_Quanterix_pTau181_04052023.xlsx'))
# Main export: coerce assay columns to numeric, keep analytes + CVs, and
# standardize the sample date to DCDate.
quanterix%>%
relocate(c(PIDN, `Sample Date`), .before = `HB Unqid`)%>%
mutate(across(c(`plasma ttau`:last_col()), ~as.numeric(.)))%>%
select(PIDN, `Sample Date`, `plasma ttau`, `plasma ttau_cv`, `plasma ptau181`,
`plasma ptau181_cv`, `plasma ab40`, `plasma ab40_cv`, `plasma ab42`,
`plasma ab42_cv`, `plasma gfap`, `plasma gfap_cv`, `plasma nfl`, `plasma nfl_cv`)%>%
clean_names()%>%
rename(PIDN = pidn,
DCDate = sample_date)%>%
mutate(DCDate = as_date(DCDate)) ->
quanterix
# Fitbit pTau181 export: same standardization, plus numeric coercion of the
# assay metadata columns.
quanterix_fitbit%>%
clean_names()%>%
rename(PIDN = pidn,
DCDate = sample_date)%>%
mutate(DCDate = as_date(DCDate),
analysis_date = as_date(analysis_date),
across(c(PIDN, sample_id, kit_lot_number, results, sd, cv_percent, run_id, cv), ~ as.numeric(.x)))%>%
select(PIDN, DCDate, analyte, specimen_type, sample_origin, sample_id, visit_number, kit_lot_number, results, units, sd, cv_percent, run_id, cv, protocol) ->
quanterix_fitbit
# Genetics: list every file under genetics/ except temp/lock files and docx.
list.files(str_c(datafolder,'genetics'), recursive = TRUE, all.files = FALSE, full.names = TRUE)%>%
str_subset( '[~$]|\\.docx', negate = TRUE) -> genetics_files
genetics_data <- read_all_files(genetics_files, prefix = 'genetics', skip = 0, clean_names = FALSE)
# some columns were repeated. using coalesce to join them together on the same name.
genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
#remove duplicated name suffx ('...1')
split.default(str_remove(names(.), "\\...+")) %>%
# join columns together and add data if data is missing in first instance of column
map_dfc(~ exec(coalesce, !!!.x))%>%
# rename column that wasn't suffixed
rename(CPEB3_rs11186856 = CPEB3)%>%
# convert month names to number and reorder to (e.g. jan-3 -> 3/1)-- undoing excels auto date detection
mutate(across(everything(),
~ifelse(str_detect(str_remove_all(.x,'[^[[:alpha:]]]'), paste(month.abb, collapse = "|")),
paste(match(str_remove_all(.x,'[^[[:alpha:]]]'),month.abb), parse_number(.x), sep = "/"), .x)),
#convert met to a and val to g
COMT_rs4680 = str_replace_all(COMT_rs4680, 'Met', 'A'),
COMT_rs4680 = str_replace_all(COMT_rs4680, 'Val', 'G')) ->
genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922
#joining omni data together and pivoting so colnames match seq data created the letter suffixes to reorder columns alphabetically-not keeping those columns now so is redundant but keep in case we want to reinclude at any point
# Combine the two omni genotype workbooks: build A1/A2 and Alt/Ref genotype
# strings, pivot wide so each gene becomes its own set of columns, then clean
# missing-genotype codes ('.', './.') and normalize names for the merge with
# sequenom data.
full_join(
genetics_data$genetics_genotypes_hillblom2019_genotypes%>%
mutate(a1a2 = paste(A1,'/',A2,sep = ''),
`__b__alt_ref` = paste(Alt, '/', Ref, sep =''))%>%
pivot_wider(id_cols = 'PIDN',
names_from = c(`Variable Name for Sequenom Merge`),
names_glue = '{`Variable Name for Sequenom Merge`}_{.value}',
values_from = c(a1a2, `GC Score`, Notes, Chr, Start, End, `__b__alt_ref`)),
genetics_data$genetics_genotypes_hillblom2019_more_genotypes%>%
mutate(a1a2 = paste(A1,'/',A2,sep = ''),
`__b__alt_ref` = paste(Alt, '/', Ref, sep =''))%>%
pivot_wider(id_cols = 'PIDN',
names_from = c(`Variable Name for Sequenom Merge`),
names_glue = '{`Variable Name for Sequenom Merge`}_{.value}',
values_from = c(a1a2, `GC Score`, Notes, Chr, Start, End, `__b__alt_ref`)), by = 'PIDN')%>%
mutate(across(everything(), ~na_if(., ".")),
across(everything(), ~na_if(., "./.")))%>%
# the __a__/__b__/... prefixes force a stable alphabetical column order
rename_with(~ str_replace(.x, 'GC Score', '__a__gc_score'), ends_with('GC Score'))%>%
rename_with(~ str_replace(.x, 'Chr', '__c__chr'), ends_with('Chr'))%>%
rename_with(~ str_replace(.x, 'Start', '__d__start'), ends_with('Start'))%>%
rename_with(~ str_replace(.x, 'End', '__e__end'), ends_with('End'))%>%
rename_with(~str_replace(.x, .x, str_c(.x,'_omni')), !matches('_a1a2|PIDN'))%>%
rename_with(~str_remove(.x,'_a1a2'))%>%
remove_empty(which = c('rows','cols'))%>%
# reformat any data that was converted to dates in excel
mutate(across(everything(),
~ifelse(str_detect(str_remove_all(.x,'[^[[:alpha:]]]'), paste(month.abb, collapse = "|")),
paste(match(str_remove_all(.x,'[^[[:alpha:]]]'),month.abb), parse_number(.x), sep = "/"), .x)),
#convert met to a and val to g -- prob not necessary but want to do before join to sequneom data
COMT_rs4680 = str_replace_all(COMT_rs4680, 'Met', 'A'),
COMT_rs4680 = str_replace_all(COMT_rs4680, 'Val', 'G')) ->
omni_data
# this joins based on overlap and PIDN, if no match, arranges to prioritize sequenom data, and, selects only columns that contain overlap name, drops empty gene data, and selects only one row-- this is to ensure we have all data that is available for each gene and that sequenom data isnt paired with mismatching omni metadata
# cleaning sequenom data before joining
# find columns that are present in both omni data and sequenom data
# BUG FIX: the sequenom list-element name was missing its final digit
# ('..._11292' instead of '..._112922'), so both lookups returned NULL and the
# overlap / sequenom-only name sets were computed against an empty set.
overlapping_genes <- intersect(names(omni_data%>%select(-PIDN)),
names(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922))
#find only genes in sequenom data
sequenom_only <- setdiff(names(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922),
names(omni_data%>%select(-PIDN)))
#create list of dfs from omni data - one for each gene and the data there within
overlapping_genes_dfs <- lapply(overlapping_genes,
function(x) omni_data%>%
select(PIDN, contains(x)))%>%
set_names(overlapping_genes)
#join list of omni gene dfs to sequenom data
# For each overlapping gene: full-join sequenom with that gene's omni frame,
# prefer the duplicate row with the most data, record where the genotype came
# from ('sequenom', 'omni', or 'sequenom + omni' -- falling back to sequenom
# when the omni GC score is missing), and keep one row per PIDN.
# FIX: `gene_name` is an external character variable, not a column; bare use
# in drop_na()/select() relies on deprecated tidyselect env lookup, so it is
# now wrapped in all_of() (same behavior, no deprecation warning).
list_of_joined_genetics_dfs<-map2(overlapping_genes_dfs, overlapping_genes, function(gene_df, gene_name) {
full_join(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
#indicate where data came from
mutate(sequenom = 'sequenom'),
gene_df%>%mutate(omni = 'omni'))%>%
arrange(PIDN, rowSums(is.na(.)))%>%
select(PIDN, contains(gene_name), sequenom, omni)%>%
group_by(PIDN)%>%
drop_na(all_of(gene_name))%>%
# create source column and concat strings from sequnom and omni columns unless theres an empty gc_score column where both omni and seq data exist
mutate(source = ifelse(str_c(gene_name,'___a__gc_score_omni') %in% colnames(.) &&
is.na(get(str_c(gene_name,'___a__gc_score_omni'))) &&
any(!is.na(sequenom)) &&
any(!is.na(omni)),
'sequenom',
str_c(sequenom[!is.na(sequenom)], omni[!is.na(omni)], sep = ' + ')))%>%
select(PIDN, source, all_of(gene_name), contains('gc_score'))%>%
rename_with(~str_replace(., 'source', str_c(gene_name, '_source')))%>%
rename_with(~str_remove_all(.x, '__[:alnum:]__'), matches('_[[:alnum:]]_'))%>%
rename_with(~str_remove_all(.x, '_omni'))%>%
distinct(PIDN, .keep_all = TRUE)})
#final genetic data to use
# Coalesce the per-gene frames into one table, add the sequenom-only genes
# (tagged with a '_source' column), order columns with the headline genes
# first, and drop the raw GC-score columns.
# BUG FIX: same missing-digit list name as above ('..._11292' -> '..._112922'),
# which made the coalesce_join target NULL. The external `sequenom_only`
# vector is also wrapped in all_of() per tidyselect.
list_of_joined_genetics_dfs%>%
reduce(., coalesce_join, by = 'PIDN')%>%
coalesce_join(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
#create source column for those columns not in omni data
mutate(across(all_of(sequenom_only), .names = '{.col}_source', ~ paste('sequenom'))), by = 'PIDN')%>%
relocate(sort(names(.)))%>%
relocate(PIDN, matches(match = 'apoe', ignore.case = TRUE),
matches(match = 'bdnf', ignore.case = TRUE),
matches(match = 'comt', ignore.case = TRUE),
matches(match = 'DRD4', ignore.case = TRUE),
matches(match = 'SNAP25', ignore.case = TRUE))%>%
mutate(across(contains('gc_score'), ~as.numeric(.)))%>%
# removing gc_score columns from genetics data
select(-ends_with('gc_score'))->
genetics_data
# Gait + Short Physical Performance Battery: read the workbook, standardize
# PIDN/DCDate, coerce the timing/measure columns to numeric, and keep only the
# analysis columns.
gait <- read_excel(str_c(datafolder,'gait/Gait + SPPB.xlsx'))
gait%>%
clean_names()%>%
rename(DCDate = neuroexam_date,
PIDN = pidn)%>%
mutate(DCDate = as_date(DCDate),
across(c(gait_version, gait_distance, leisure_t1, leisure_t2, leisure_t3, gait_leisure_avg, speed_t1, speed_t2, speed_t3, gait_speed_avg),
~ as.numeric(.x)))%>%
relocate(DCDate, .after = PIDN)%>%
select(PIDN, DCDate, project, gait_version, gait_distance, leisure_t1, leisure_t2, leisure_t3, gait_leisure_avg, speed_t1, speed_t2, speed_t3, gait_speed_avg) ->
gait
# Amyloid PET: PET-core-validated export only.
pet_data <- read_excel(str_c(datafolder,"pet/Hillblom_AmyloidPET_Final_11-3-22_CY.xlsx"))
#ignore data from lava-PET core cannot validate it. Ignore previous dataset pet data as well. Only use data provided most recently by PETcore team. The thresholds were given by Renaud on 2022-12-09.
# AV45: 1.11 SUVR (Landau et al, JNM 2013; https://pubmed.ncbi.nlm.nih.gov/23166389/)
# PIB: 1.21 SUVR (Villeneuve et al Brain 2015; https://pubmed.ncbi.nlm.nih.gov/25953778/)
# FBB: 1.08 SUVR (Royse et al, ART 2021 https://pubmed.ncbi.nlm.nih.gov/33971965/)
pet <- pet_data %>%
  rename(DCDate = PETDate) %>%
  mutate(
    PETsuvr = as.numeric(PETsuvr),
    PETcentiloids = as.numeric(PETcentiloids),
    DCDate = as_date(DCDate),
    # compound-specific amyloid-positivity threshold (citations above)
    PET_suvr_threshold = case_when(
      PETcompound == 'AV45' ~ 1.11,
      PETcompound == 'PIB' ~ 1.21,
      PETcompound == 'FBB' ~ 1.08,
      TRUE ~ NA_real_
    ),
    # 1 = amyloid positive, 0 = negative, NA when threshold or SUVR missing
    PET_suvr_threshold_positive = as.numeric(PETsuvr > PET_suvr_threshold)
  )
# Sleep team deliverables: questionnaire workbook (all sheets, via the project
# helper read_all_files) plus the updated sleep-profiler metrics file.
sleep_data <- read_all_files(str_c(datafolder,"sleep/sleep_team_data.xlsx"), sleep, 0, clean_names = TRUE)
sleep_profiler_data <- read_excel(str_c(datafolder,"sleep/sleep_study_metrics_April_2023.xlsx"))
# Rename the sleep team's column names to the previous dataset's variable
# names (the psqi_slpqual/psqi_meds self-renames are kept as explicit
# documentation of the mapping), derive the ISI severity category and the
# binary PSQI flag, and drop mostly-empty rows.
sleep_data$sleep_questionnaire_data_sleep_team_data <- sleep_data$sleep_questionnaire_data_sleep_team_data%>%
rename(DCDate = sleep_study_date,
Berlin_apneaRisk = berlin_risk,
ISItotal = isi,
psqi_Durat = final_sleep_duration_score,
psqi_Disturb = final_sleep_disturbance_score,
psqi_Laten = final_sleep_latency_score,
psqi_daydys = final_daytime_dysfunction_score,
psqi_slpqual = psqi_slpqual,
psqi_meds = psqi_meds,
psqi_hse = final_hse_score,
psqi_PSQItot = global_psqi_score,
ESStotal = ess_total)%>%
# standard ISI severity bins (0-7 / 8-14 / 15-21 / 22-28)
mutate(ISIdx = ifelse(ISItotal <= 7, "no clinical insomnia",
ifelse((ISItotal >= 8) & (ISItotal <= 14), "subthreshold insomnia",
ifelse((ISItotal >= 15) & (ISItotal <= 21), "clinical insomnia (moderate)",
ifelse((ISItotal >= 22) & (ISItotal <= 28), "clinical insomnia", NA)))),
# binary poor-sleeper flag: global PSQI >= 6
psqi_totBinary = ifelse(psqi_PSQItot >= 6, 1, 0),
DCDate = as_date(DCDate))%>%
#filtering out data with less than 75% of data (NAs or all 0s). see as_date('2022-03-12') & PIDN == 23898
# (counts NA-or-zero cells across ALL columns of the row, IDs included)
filter(rowSums(is.na(.) | . == 0) < .75 * ncol(.))
# Subset of sleep-team instruments (CTI, SPS, FOSQ, MEQ, BMI) kept for the dataset.
sleep_data$sleep_questionnaire_data_sleep_team_data%>%
select(PIDN, DCDate, bmi, cti_lv, cti_fr, sps_cbc, sps_ac, sps_total,
fosq_general_productivity, fosq_social_outcome, fosq_activity_level,
fosq_vigilance, fosq_intimate_rels_sexual_activity, fosq_total, meq_morn_total, meq_score) ->
sleep_instruments
# not using the averages at the moment — these data are also outdated and have been replaced with updated sleep profiler data from the sleep team. The commented-out code below can be used to convert new data to averages if requested.
# sleep_data$sleep_sleep_profiler_data_sleep_team_data <- sleep_data$sleep_sleep_profiler_data_sleep_team_data%>%
# group_by(PIDN)%>%
# fill(sleep_study_date, .direction = 'down')%>%
# rename(DCDate = sleep_study_date)%>%
# group_by(PIDN, DCDate)%>%
# mutate(DCDate = as_date(DCDate))%>%
# mutate(across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
# ~mean(.x),.names = '{.col}_avg'),
# across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
# ~sd(.x),.names = '{.col}_sd'),
# across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
# ~max(.x),.names = '{.col}_max'),
# across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
# ~min(.x),.names = '{.col}_min'),
# num_nights = max(night))%>%
# select(PIDN, DCDate, num_nights, ends_with('avg'), ends_with('min'), ends_with('max'), ends_with('sd'))%>%
# relocate(PIDN, DCDate, num_nights, sort(names(.)))%>%
# distinct(PIDN, DCDate, .keep_all = TRUE)%>%
# select(-actual_sleep_time_min, -total_waso_min, -sleep_latency_min) ->
# sleep_profiler
# Sleep-profiler metrics: one output row per contiguous recording block, with
# each metric spread into per-night columns (metric_night_1, metric_night_2, ...).
sleep_profiler_data%>%
rename(DCDate = StudyDate)%>%
mutate(DCDate = as_date(DCDate))%>%
arrange(PIDN, DCDate) %>%
# a new recording block (grp) starts whenever the Night counter resets
# (diff(Night) < 0); cumsum over that flag numbers the blocks sequentially
# across the whole arranged table
group_by(PIDN, grp = cumsum(c(TRUE, diff(Night) < 0)))%>%
# first/last study date of each recording block
mutate(startdate = min(DCDate), enddate = max(DCDate))%>%
relocate(startdate, enddate, .after = DCDate)%>%
# NOTE(review): grp is not in id_cols, so blocks are keyed by PIDN + the
# start/end dates; two blocks sharing identical dates would collide — confirm
pivot_wider(id_cols = c('PIDN', 'startdate', 'enddate'),
values_from = -c('PIDN', 'DCDate', 'Sleep_Profiler_Device_SN', 'startdate', 'enddate', 'Night', 'grp'),
names_from = 'Night',
names_glue = '{.value}_night_{Night}') ->
sleep_profiler
# ApneaLink home sleep-apnea test results from the sleep team, with the
# recording date standardized to the DCDate/date convention used throughout.
apnea <- sleep_data$sleep_apnea_data_sleep_team_data %>%
  rename(DCDate = apnealink_recordingdate) %>%
  mutate(DCDate = as_date(DCDate))
# Epworth Sleepiness Scale: combine Qualtrics responses with the sleep team's
# ESS totals, keep one row per PIDN/DCDate preferring
# qualtrics > sleep_team > prev_dataset (arrange order decides which row
# distinct() keeps), then shorten the verbose Qualtrics item names.
qualtrics_data$qualtrics_epworth_sleepiness_scale%>%
bind_rows(sleep_data$sleep_questionnaire_data_sleep_team_data%>%
select(PIDN, DCDate, ESStotal)%>%
mutate(source = 'sleep_team')%>%
filter(!is.na(ESStotal)))%>%
group_by(PIDN, DCDate)%>%
arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset')))%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
rename(age = what_is_your_age_in_years,
sitting_and_reading = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_and_reading,
watching_tv = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_watching_tv,
inactive_in_a_public_place =
how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_inactive_in_a_public_place_e_g_a_theater_or_a_meeting,
passenger_in_car =
how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_as_a_passenger_in_a_car_for_an_hour_without_a_break,
rest_in_afternoon =
how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_lying_down_to_rest_in_the_afternoon_when_circumstances_permit,
sitting_and_talking = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_and_talking_to_someone,
after_lunch = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_quietly_after_a_lunch_without_alcohol,
in_car_while_stopped =
how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_in_a_car_while_stopped_for_a_few_minutes_in_the_traffic,
sum_high_chance_of_dozing = high,
sum_mod_chance_of_dozing = moderate,
sum_slight_chance_of_dozing = slight,
sum_would_never_doze = no)%>%
# drop demographics collected by the Qualtrics form (available elsewhere)
select(-what_is_your_sex, -age)%>%
# project helper: resolves remaining qualtrics/previous-dataset overlaps
prioritize_qualtrics_data() ->
epworth_sleepiness
# Insomnia Severity Index: combine Qualtrics with the sleep team's records
# (ISIdx/ISItotal were derived above), keeping one row per PIDN/DCDate and
# preferring qualtrics > sleep_team > prev_dataset (the arrange order
# determines which duplicate distinct() keeps).
qualtrics_data$qualtrics_insomnia_severity_index_isi%>%
bind_rows(sleep_data$sleep_questionnaire_data_sleep_team_data%>%
select(PIDN, DCDate, ISIdx, ISItotal)%>%
filter(!is.na(ISItotal))%>%
mutate(source = 'sleep_team'))%>%
group_by(PIDN, DCDate)%>%
arrange(PIDN, DCDate, factor(source, levels = c('qualtrics','sleep_team', 'prev_dataset')))%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
# project helper: resolves remaining qualtrics/previous-dataset overlaps
prioritize_qualtrics_data() ->
insomnia_severity_index
# Pittsburgh Sleep Quality Index: keep only the global score and its binary
# flag from both sources, tag both with instr_type = 'PSQI', then dedupe per
# PIDN/DCDate preferring qualtrics > sleep_team > prev_dataset.
qualtrics_data$qualtrics_pittsburgh_sleep_quality_index%>%
mutate(instr_type = 'PSQI')%>%
select(PIDN, DCDate, source, instr_type, psqi_PSQItot, psqi_totBinary)%>%
bind_rows(sleep_data$sleep_questionnaire_data_sleep_team_data%>%
select(PIDN, DCDate, psqi_PSQItot, psqi_totBinary)%>%
mutate(source = 'sleep_team',
instr_type = 'PSQI')%>%
filter(!is.na(psqi_PSQItot)))%>%
group_by(PIDN, DCDate)%>%
arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset')))%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
# project helper: resolves remaining qualtrics/previous-dataset overlaps
prioritize_qualtrics_data() ->
psqi
# Berlin sleep questionnaire: combine Qualtrics with the sleep team's records,
# normalize the apnea-risk labels to a single lower-case spelling, and keep
# one row per PIDN/DCDate preferring qualtrics > sleep_team > prev_dataset
# (the arrange order decides which duplicate distinct() keeps).
berlin_sleep <- qualtrics_data$qualtrics_berlin_sleep_questionnaire %>%
  bind_rows(sleep_data$sleep_questionnaire_data_sleep_team_data %>%
              select(PIDN, DCDate, Berlin_apneaRisk) %>%
              mutate(source = 'sleep_team') %>%
              filter(!is.na(Berlin_apneaRisk))) %>%
  group_by(PIDN, DCDate) %>%
  arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset'))) %>%
  # harmonize label spellings coming from the two sources
  mutate(Berlin_apneaRisk = case_when(Berlin_apneaRisk == 'HighRisk' ~ 'high risk',
                                      Berlin_apneaRisk == 'LowRisk' ~ 'low risk',
                                      TRUE ~ tolower(Berlin_apneaRisk))) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE) %>%
  # project helper: resolves remaining qualtrics/previous-dataset overlaps
  prioritize_qualtrics_data()
# CSF analyte panel carried over from the frozen Spring-2020 dataset: keep
# rows that have both a draw date and a Zetterberg specimen ID, standardize
# the draw date to DCDate, and select the requested analytes.
old_dataset <- read_csv(str_c(datafolder, 'old_dataset/HBSpring2020_2020-06-18.T1.WMH.v2.v6.SNI.Repro.moregene.meso.champscomp.newPET.newFitbit.PRSupdated.csv'))
csf <- old_dataset %>%
  select(PIDN, csfdrawDate, csfSpecID:GAP43..156.10000.pg.ml..CSF) %>%
  rename(DCDate = csfdrawDate) %>%
  # dates in the old CSV are stored as m/d/Y strings
  mutate(DCDate = as_date(DCDate, format = '%m/%d/%Y')) %>%
  filter(!is.na(DCDate) & !is.na(csfzetterbergSpecID)) %>%
  select(PIDN, DCDate, csfSpecID, AB1.40..pg.mL., AB1.42..pg.mL.,
         pTau...pg.mL., tTau...pg.mL., NG36..pg.mL., SYT1..pM.,
         SNAP25long..pM., SNAP25tot..pM., GAP43..156.10000.pg.ml..CSF)
# Preserve each source's instrument id under a unique column name so the ids
# survive the multi-source join/bind below (where the shared `instr_id`
# column gets coalesced). neuropsychbedside's id arrives as instr_id_11 —
# presumably a clean_names() suffix from duplicated headers; confirm upstream.
lava$bedsidealternates<-lava$bedsidealternates%>%mutate(instr_id_alternates = instr_id)%>%relocate(instr_id_alternates, .after = PIDN)
lava$bedsidescreen_witholdvars<-lava$bedsidescreen_witholdvars%>%mutate(instr_id_old_vars = instr_id)%>%relocate(instr_id_old_vars, .after = PIDN)
lava$neuropsychbedside<-lava$neuropsychbedside%>%mutate(instr_id_np_bedside = instr_id_11)%>%relocate(instr_id_np_bedside, .after = DCDate)
lava$neuropsychcvlt<-lava$neuropsychcvlt%>%mutate(instr_id_np_cvlt_bedside = instr_id)%>%relocate(instr_id_np_cvlt_bedside, .after = PIDN)
# keeps only the vars hillblom wants, joins all sources from lava, and creates
# wrat_baseline and wrat_baseline_date var. Replaces negative values with NAs.
# The left_join ... slice(0) prelude builds an empty template holding the
# union of all sources' columns so the bind_rows below lines everything up.
lava$bedsidescreen %>%
  left_join(lava$bedsidealternates) %>%
  left_join(lava$bedsidescreen_witholdvars) %>%
  left_join(lava$neuropsychbedside) %>%
  left_join(lava$neuropsychcvlt) %>%
  slice(0) %>%
  # stack every source, tagging where each row came from
  bind_rows(lava$neuropsychbedside %>%
              mutate(source = 'np_bs'),
            lava$bedsidescreen %>%
              mutate(source = 'bs'),
            lava$bedsidealternates %>%
              mutate(source = 'bsa'),
            lava$neuropsychcvlt %>%
              mutate(source = 'np_cvlt'),
            lava$bedsidescreen_witholdvars %>%
              mutate(source = 'bsov')) %>%
  group_by(PIDN, DCDate) %>%
  # sets instr_id from instr_id_11, then instr_id_141, when instr_id is empty.
  # BUGFIX: the original nested ifelse() tested is.na(instr_id) in the OUTER
  # branch first and returned instr_id_11 for every missing instr_id, so the
  # instr_id_141 fallback (rows missing both) was unreachable.
  mutate(instr_id = ifelse(is.na(instr_id),
                           ifelse(is.na(instr_id_11), instr_id_141, instr_id_11),
                           instr_id)) %>%
  # replace negative values (LAVA missing-data codes) with NA
  mutate(across(1:instr_id_np_cvlt_bedside, function(x) {replace(x, which(x<0), NA)})) %>%
  select(PIDN:instr_id, source, mmse_tot:mod_rey, digit_fw, digit_bw, wrat_tot, d_corr:rey_recg, bnt_corr:numb_loc, gds1:gds15to,
         instr_id_alternates, instr_type_2, cv2form:cv2rd, mod_rey_b:rey_b_recg) %>%
  #arranges by row with most data, fills missing values if they exist on a row with same date
  arrange(PIDN, DCDate, rowSums(is.na(.))) %>%
  fill(everything(), .direction = "up") %>%
  #creates cols to identify where data came from if filled from other row
  mutate(other_instr_ids = str_c(unique(c(paste(instr_id))), collapse = ", ")) %>%
  mutate(other_instr_types = str_c(unique(c(instr_type)), collapse = ", ")) %>%
  relocate(source, other_instr_ids, other_instr_types, .after = instr_id) %>%
  # baseline WRAT: earliest valid (non-negative) wrat_tot per participant
  left_join(lava$bedsidescreen %>%
              select(PIDN, DCDate, wrat_tot) %>%
              group_by(PIDN) %>%
              filter(!is.na(wrat_tot) & wrat_tot >= 0) %>%
              arrange(PIDN, DCDate) %>%
              slice(1) %>%
              rename(wrat_baseline_date = DCDate, wrat_baseline = wrat_tot), by = 'PIDN') %>%
  mutate(wrat_on_or_before_current_date = wrat_baseline_date <= DCDate) %>%
  relocate(wrat_baseline_date, wrat_baseline, wrat_on_or_before_current_date, .before = wrat_tot) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE) %>%
  select(-age_at_dc) ->
  bedside
## Modified Trails & CVLT-long derived scores, computed in a single mutate
## chain (each expression may reference columns created earlier in the chain).
bedside <- bedside %>%
  mutate(
    ## Modified Trails: correct responses per minute, and its log transform
    mt_ratio = (60 * mt_corr) / mt_time,
    mt_ln = log(mt_ratio + 1),
    ## CVLT Long: total false positives across the four categories
    cv2fptot = cv2b_r + cv2b_u + cv2np + cv2nu,
    # hit/false-alarm proportions with the standard 0.5 floor/ceiling
    # correction. (Error found previously: an earlier version used cv2fptot
    # where cv2hit should have been, creating differences in this data
    # compared to the previous dataset.)
    cv2phit = ifelse(cv2hit == 16, 15.5 / 16,
                     ifelse(cv2hit == 0, 0.5 / 16, cv2hit / 16)),
    cv2pfp = ifelse(cv2fptot == 32, 31.5 / 32,
                    ifelse(cv2fptot == 0, 0.5 / 32, cv2fptot / 32)),
    # signal-detection scores: d-prime and response bias
    cv2zhit = qnorm(cv2phit),
    cv2zfp = qnorm(cv2pfp),
    cv2dprime = cv2zhit - cv2zfp,
    cv2bias = -.5 * (cv2zhit + cv2zfp)
  )
## 1 Back: signal-detection scoring
lava$`1back` <- lava$`1back` %>%
  mutate(
    # hit rate with +0.5 correction over 11; only scored when nb1c > 5
    # and at least one correct response was recorded
    nb1Hits = ifelse(nb1c > 5 & nb1smc > 0, (nb1smc + 0.5) / 11, NA),
    # total correct "no" responses across the four segments
    nb1TotalNo = nb1s1c + nb1s2c + nb1s3c + nb1s4c,
    # false-alarm rate with +0.5 correction over 21
    nb1FalseAlarms = (20 - nb1TotalNo + 0.5) / 21,
    nb1ZFA = qnorm(nb1FalseAlarms),
    nb1ZHIT = qnorm(nb1Hits),
    nb1dprime = nb1ZHIT - nb1ZFA
  )
## 2 Back: signal-detection scoring (same scheme as the 1-back, plus a bias score)
lava$`2back` <- lava$`2back` %>%
  mutate(
    # hit rate with +0.5 correction over 31; only scored when nb2c > 10
    # and at least one correct response was recorded
    nb2Hits = ifelse(nb2c > 10 & nb2smc > 0, (nb2smc + 0.5) / 31, NA),
    # total correct "no" responses across the four segments
    nb2TotalNo = nb2s1c + nb2s2c + nb2s3c + nb2s4c,
    # false-alarm rate with +0.5 correction over 61
    nb2FalseAlarms = (60 - nb2TotalNo + 0.5) / 61,
    nb2ZFA = qnorm(nb2FalseAlarms),
    nb2ZHIT = qnorm(nb2Hits),
    nb2dprime = nb2ZHIT - nb2ZFA,
    nb2bias = (nb2ZHIT + nb2ZFA) / 2
  )
## Flanker eprime: accuracy and RT subscores combined into flankerinc
lava$enclosedflanker <- lava$enclosedflanker %>%
  mutate(
    flkincacc = ef_inc_cnt / 40,
    flkincaccscore = 5 * flkincacc,
    flkinclog = log10(ef_inc_med),
    # position of the log median RT between log10(400) and log10(800)
    flkincstem = (flkinclog - log10(400)) / (log10(800) - log10(400)),
    flkincrtscore = 5 - (5 * flkincstem),
    # combined score only when more than 10 incongruent trials were correct
    flankerinc = ifelse(ef_inc_cnt > 10, flkincaccscore + flkincrtscore, NA)
  )
## SetShifting eprime: accuracy and RT subscores combined into shiftscore
lava$setshifting <- lava$setshifting %>%
  mutate(
    shiftacc = all_shft_c_shift_corr /
      (all_shft_c_shift_corr + all_shft_e_shift_errors),
    shiftaccscore = 5 * shiftacc,
    # clamp the median shift RT into [400, 2800] before the log transform
    # (pmin/pmax preserve NA exactly like the original subassignments did)
    all_shft_m_shift_median = pmin(pmax(all_shft_m_shift_median, 400), 2800),
    shftlog = log10(all_shft_m_shift_median),
    # position of the log median RT between log10(450) and log10(1600)
    shftstem = (shftlog - log10(450)) / (log10(1600) - log10(450)),
    shiftrtscore = 5 - (5 * shftstem),
    shiftscore = shiftaccscore + shiftrtscore
  )
## Processing Speed
## Condition-level z-scores for the information-processing-speed battery.
## Each condition is scored only when its accuracy is at least 70; the median
## RT is standardized against hard-coded normative means/SDs (normative
## source not shown in this file — presumably cohort norms; confirm).
lava$infoprocessingspeed$animyesz <- ifelse(lava$infoprocessingspeed$anim_yes_acc >= 70,(lava$infoprocessingspeed$anim_yes_med - 595.01)/ 83.42073526,NA)
lava$infoprocessingspeed$animnoz <- ifelse(lava$infoprocessingspeed$anim_no_acc >= 70,(lava$infoprocessingspeed$anim_no_med - 586.48) / 78.34211216,NA)
lava$infoprocessingspeed$line10z <- ifelse(lava$infoprocessingspeed$lines_10_acc >= 70,(lava$infoprocessingspeed$lines_10_med - 606.74) /129.6893719,NA)
lava$infoprocessingspeed$line20z <- ifelse(lava$infoprocessingspeed$lines_20_acc >= 70,(lava$infoprocessingspeed$lines_20_med - 545.98) / 111.1612913,NA)
lava$infoprocessingspeed$rhymeyesz <- ifelse(lava$infoprocessingspeed$rhyme_yes_acc >= 70,(lava$infoprocessingspeed$rhyme_yes_med - 1300.34) / 252.0888475,NA)
lava$infoprocessingspeed$rhymenoz <- ifelse(lava$infoprocessingspeed$rhyme_no_acc >= 70,(lava$infoprocessingspeed$rhyme_no_med - 1247.78) / 215.990472,NA)
#added rotate to existing composite scripts
lava$infoprocessingspeed$rotate60z <- ifelse(lava$infoprocessingspeed$rotate_60_acc >= 70, (lava$infoprocessingspeed$rotate_60_med - 1531.53) / 482.8357335, NA)
lava$infoprocessingspeed$rotate120z <- ifelse(lava$infoprocessingspeed$rotate_120_acc >= 70, (lava$infoprocessingspeed$rotate_120_med - 1922.08) / 680.7012085, NA)
lava$infoprocessingspeed$search16nz <- ifelse (lava$infoprocessingspeed$search_16n_acc >= 70,(lava$infoprocessingspeed$search_16n_med - 1189.21) /357.3644424,NA)
lava$infoprocessingspeed$search16yz <- ifelse (lava$infoprocessingspeed$search_16y_acc >= 70,(lava$infoprocessingspeed$search_16y_med - 793.00) /124.7270867,NA)
lava$infoprocessingspeed$search24nz <- ifelse (lava$infoprocessingspeed$search_24n_acc >= 70,(lava$infoprocessingspeed$search_24n_med - 1465.84) /461.1421215,NA)
lava$infoprocessingspeed$search24yz <- ifelse (lava$infoprocessingspeed$search_24y_acc >= 70,(lava$infoprocessingspeed$search_24y_med - 855.75) /147.8585169,NA)
lava$infoprocessingspeed$wordyesz <- ifelse(lava$infoprocessingspeed$word_yes_acc >= 70,(lava$infoprocessingspeed$word_yes_med - 633.19) / 101.3623032,NA)
# NOTE(review): word_no_acc uses `> 70` while every other condition uses
# `>= 70` — looks like a typo, but left as-is to match the previous dataset;
# confirm with the team before changing.
lava$infoprocessingspeed$wordnoz <- ifelse(lava$infoprocessingspeed$word_no_acc > 70,(lava$infoprocessingspeed$word_no_med - 622.24) /95.85706423,NA)
lava$infoprocessingspeed$match21z <- ifelse(lava$infoprocessingspeed$match2_l1_acc >= 70,(lava$infoprocessingspeed$match2_l1_med - 1790.48) /597.9149021,NA)
lava$infoprocessingspeed$match22z <- ifelse(lava$infoprocessingspeed$match2_l2_acc >= 70,(lava$infoprocessingspeed$match2_l2_med - 2165.30) / 746.3290314,NA)
lava$infoprocessingspeed$match23z <- ifelse(lava$infoprocessingspeed$match2_l3_acc >= 70,(lava$infoprocessingspeed$match2_l3_med - 1974.99) / 602.7246202,NA)
#added this--not in previous script but was sent by Joel
lava$infoprocessingspeed$rotatez= .5 * (lava$infoprocessingspeed$rotate60z + lava$infoprocessingspeed$rotate120z)
# verbal conditions that enter the verbal composite (columns created just below)
v <-c("pronz","animz","rhymez","wordz")
lava$infoprocessingspeed$pronz <- ifelse(lava$infoprocessingspeed$pron_acc >= 70,(lava$infoprocessingspeed$pron_med - 1954.01) /605.5701116,NA)
lava$infoprocessingspeed$animz <- 0.5*(lava$infoprocessingspeed$animyesz + lava$infoprocessingspeed$animnoz)
lava$infoprocessingspeed$rhymez <- 0.5*(lava$infoprocessingspeed$rhymeyesz + lava$infoprocessingspeed$rhymenoz)
lava$infoprocessingspeed$wordz <- 0.5*(lava$infoprocessingspeed$wordyesz + lava$infoprocessingspeed$wordnoz)
# count of non-missing verbal condition scores per row
lava$infoprocessingspeed$nmverbal <- apply(!is.na(lava$infoprocessingspeed[,v]), 1, sum)
# spatial conditions that enter the spatial composite
s <- c("dotz","match1z","shapez","match2z","linez","searchz")
lava$infoprocessingspeed$dotz <- ifelse(lava$infoprocessingspeed$dot_acc >= 70,(lava$infoprocessingspeed$dot_med - 613.73) /129.6581279,NA)
lava$infoprocessingspeed$match1z <- ifelse(lava$infoprocessingspeed$match_acc >= 70,(lava$infoprocessingspeed$match_med - 1193.08) / 295.5833986,NA)
lava$infoprocessingspeed$shapez <- ifelse(lava$infoprocessingspeed$shape_acc >= 70,(lava$infoprocessingspeed$shape_med - 697.95) / 129.4176206,NA)
# NOTE(review): the 0.33/0.25/0.5 multipliers approximate means of the
# subscores (0.33 is not exactly 1/3); kept to match the previous dataset.
lava$infoprocessingspeed$match2z <- 0.33*(lava$infoprocessingspeed$match21z + lava$infoprocessingspeed$match22z + lava$infoprocessingspeed$match23z)
lava$infoprocessingspeed$linez <- 0.5*(lava$infoprocessingspeed$line10z + lava$infoprocessingspeed$line20z)
lava$infoprocessingspeed$searchz=0.25* (lava$infoprocessingspeed$search16nz + lava$infoprocessingspeed$search16yz + lava$infoprocessingspeed$search24nz + lava$infoprocessingspeed$search24yz)
# count of non-missing spatial condition scores per row
lava$infoprocessingspeed$nmspatial <- apply(!is.na(lava$infoprocessingspeed[,s]), 1, sum)
## Verbal composite
# mean of the available verbal z-scores, requiring at least 3 of 4 conditions
lava$infoprocessingspeed$vsumz <- ifelse(lava$infoprocessingspeed$nmverbal>=3,rowSums(lava$infoprocessingspeed[,v], na.rm=TRUE),NA)
lava$infoprocessingspeed$vngood <- as.numeric(lava$infoprocessingspeed$nmverbal)
lava$infoprocessingspeed$verbal = lava$infoprocessingspeed$vsumz/lava$infoprocessingspeed$vngood
## Spatial composite
# mean of the available spatial z-scores, requiring at least 5 of 6 conditions
lava$infoprocessingspeed$ssumz <- ifelse(lava$infoprocessingspeed$nmspatial>=5,rowSums(lava$infoprocessingspeed[,s], na.rm=TRUE),NA)
lava$infoprocessingspeed$sngood <- as.numeric(lava$infoprocessingspeed$nmspatial)
lava$infoprocessingspeed$spatial = lava$infoprocessingspeed$ssumz/lava$infoprocessingspeed$sngood
## Memory score
## Component z-scores standardized against hard-coded normative means/SDs
## (normative source not shown in this file — presumably cohort norms; confirm).
bedside$immrecall <- bedside$cv2t1c + bedside$cv2t2c + bedside$cv2t3c + bedside$cv2t4c + bedside$cv2t5c
# Benson figure recall: prefer the standard form, fall back to the alternate
bedside$bensonrecall <-ifelse(!is.na(bedside$rey10m), bedside$rey10m, bedside$rey_b10m)
# components entering the memory composite
m <-c("immrecallz","cv2lfrcz","cv2dprimez","bensonrecallz")
bedside$immrecallz <- (bedside$immrecall - 50.8277) /10.45484
bedside$cv2lfrcz <- (bedside$cv2lfrc- 11.55) /3.066
bedside$cv2dprimez=(bedside$cv2dprime - 3.1658) /.72185
bedside$bensonrecallz=(bedside$bensonrecall - 11.1576) /3.14529
# count of non-missing memory components per row
bedside$nmmemory <- apply(!is.na(bedside[,m]), 1, sum)
## Memory composite
# mean of the available memory z-scores, requiring at least 3 of 4 components
bedside$msumz <- ifelse(bedside$nmmemory>=3,rowSums(bedside[,m], na.rm=TRUE),NA)
bedside$mngood <- as.numeric(bedside$nmmemory)
bedside$memoryzscore=bedside$msumz /bedside$mngood
## Bedside Executive score
# components entering the executive composite (MTTimez is sign-flipped so
# that faster times yield higher scores)
e <- c("DigitBWz", "StrpCorz", "MTTimez", "dcorrz", "DFCorrz")
bedside$DigitBWz <- (bedside$digit_bw - 5.34) /1.262
bedside$StrpCorz <- (bedside$strp_cor - 48.53) /10.991
bedside$MTTimez <- (32.18 - bedside$mt_time) /16.618
bedside$dcorrz <- (bedside$d_corr - 15.7128) /4.55404
bedside$DFCorrz <- (bedside$df_corr - 10.51) /3.195
# count of non-missing executive components per row
bedside$nmbsex <- apply(!is.na(bedside[,e]), 1, sum)
## Bedside Executive composite
# mean of the available executive z-scores, requiring at least 3 of 5 components
bedside$esumz <- ifelse(bedside$nmbsex>=3,rowSums(bedside[,e], na.rm=TRUE),NA)
bedside$engood <- as.numeric(bedside$nmbsex)
bedside$bsexzscore <- bedside$esumz /bedside$engood
## Round the derived variables to THREE decimal places (half-up, via janitor,
## to match the previous dataset's rounding convention — the old comments
## said "two" but the code always rounded to 3) and keep only the requested
## columns per instrument, dropping the intermediate count/sum columns used
## to build the composites.

# bedside
bedside <- bedside %>%
  select(-engood, -nmmemory, -nmbsex, -mngood, -msumz, -esumz) %>%
  mutate(across(c(immrecall, bensonrecall, immrecallz,
                  cv2fptot, cv2phit, cv2pfp, cv2zhit, cv2zfp, cv2dprime, cv2bias, cv2dprimez,
                  bensonrecallz, memoryzscore, DigitBWz, StrpCorz, MTTimez, dcorrz, DFCorrz,
                  bsexzscore, mt_ratio, mt_ln),
                ~ janitor::round_half_up(.x, 3)))

# information processing speed
infoprocessingspeed <- lava$infoprocessingspeed %>%
  select(-nmverbal, -nmspatial, -vngood, -sngood, -vsumz, -ssumz) %>%
  mutate(across(c(animyesz, animnoz, line10z, line20z, rhymeyesz, rhymenoz,
                  rotate60z, rotate120z, search16nz, search16yz, search24nz,
                  search24yz, wordyesz, wordnoz, match21z, match22z, match23z,
                  rotatez, pronz, animz, rhymez, wordz, dotz,
                  match1z, shapez, match2z, linez, searchz, verbal, spatial),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version, animyesz, animnoz, line10z,
         line20z, rhymeyesz, rhymenoz, rotate60z, rotate120z, search16nz, search16yz,
         search24nz, search24yz, wordyesz, wordnoz, match21z, match22z, match23z,
         rotatez, pronz, animz, rhymez, wordz, dotz, match1z, shapez, match2z, linez,
         searchz, verbal, spatial)

# 1 Back
`1back` <- lava$`1back` %>%
  mutate(across(c(nb1Hits, nb1TotalNo, nb1FalseAlarms, nb1ZFA, nb1ZHIT, nb1dprime),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version,
         nb1Hits, nb1TotalNo, nb1FalseAlarms, nb1ZFA, nb1ZHIT, nb1dprime)

# 2 Back
`2back` <- lava$`2back` %>%
  mutate(across(c(nb2Hits, nb2TotalNo, nb2FalseAlarms, nb2ZFA, nb2ZHIT, nb2dprime, nb2bias),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version,
         nb2Hits, nb2TotalNo, nb2FalseAlarms, nb2ZFA, nb2ZHIT, nb2dprime, nb2bias)

# Flanker eprime
enclosedflanker <- lava$enclosedflanker %>%
  mutate(across(c(flkincacc, flkincaccscore, flkinclog, flkincstem, flkincrtscore, flankerinc),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version,
         flkincacc, flkincaccscore, flkinclog, flkincstem, flkincrtscore, flankerinc)

# SetShifting eprime (carries `platform` rather than task/version)
setshifting <- lava$setshifting %>%
  mutate(across(c(shiftacc, shiftaccscore, shftlog, shftstem, shiftrtscore, shiftscore),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, platform,
         shiftacc, shiftaccscore, shftlog, shftstem, shiftrtscore, shiftscore)
# GDS subscale totals (one total per row; an NA in any item yields NA, as before).
# ROBUSTNESS FIX: the original used bare sum() inside mutate, which was only
# row-wise because bedside happened to still be grouped by (PIDN, DCDate)
# with exactly one row per group; if that grouping were ever dropped, sum()
# would silently total each item column across ALL rows. rowSums(across())
# computes the same per-row totals without depending on the grouping.
bedside%>%
  mutate(gds_depression = rowSums(across(c(gds16, gds9, gds3, gds1, gds23, gds7, gds4, gds25, gds15))),
         gds_wav = rowSums(across(c(gds12, gds28, gds2, gds19, gds20, gds21))),
         gds_anxiety = rowSums(across(c(gds8, gds13, gds6, gds18))),
         gds_cognitive = rowSums(across(c(gds14, gds26, gds30, gds29))),
         gds_hopelessness = rowSums(across(c(gds22, gds10, gds17, gds5))),
         gds_agitation = rowSums(across(c(gds11, gds24, gds27)))) ->
  bedside
# curate the dataset to pass into code from scoreUDS_mirt_toShare.R
# Stack the three UDS neuropsych instruments, keep the most complete row per
# PIDN/DCDate, attach demographics (right_join so every demo visit is kept),
# blank out LAVA/NACC missing-data codes, and rename to the variable names
# the scoring script expects.
data_raw <- bind_rows(
lava$udsneuropsych,
lava$udsneuropsychtcog,
lava$udsneuropsychmoca)%>%
group_by(PIDN,DCDate)%>%
arrange(PIDN, DCDate, rowSums(is.na(.)))%>%
distinct(PIDN, DCDate, .keep_all = TRUE)%>%
right_join(lava$udssubjectdemo%>%
select(PIDN,DCDate,educ,sex), by = c('PIDN','DCDate'))%>%
# X = sex - 1: assumes LAVA codes sex as 1/2 so that X is 0/1 with male = 0,
# as the scoring script requires — TODO confirm the LAVA coding
mutate('X' = sex-1)%>%
# NOTE: the grouping vars (PIDN, DCDate) are retained by select() because the
# frame is still grouped, so DCDate survives for the rename at the very end
select(PIDN, educ, age_at_dc, X, sex, animals, veg, udsverfc, udsverlc, digbacct, traila, trailali, trailb, trailbli)%>%
# replace LAVA/NACC missing/invalid codes with NA
mutate(across(c(educ ,sex, animals:digbacct, trailali, trailbli),
~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, -10, 88, 95, 96, 97, 98, 99, 888, 8888, 995, 996, 997, 998), NA, .)))%>%
mutate(across(c(traila, trailb), ~ifelse(. %in% c(-9, -8, -7, -6, 995, 996, 997, 998), NA, .)))%>%
# carry education forward within each participant (assumed stable over time)
group_by(PIDN)%>%
fill(educ, .direction = 'down')%>%
ungroup()%>%
rename(naccage = age_at_dc)%>%
# swap in the 0/1 recode as `sex`, dropping the original 1/2 coding
select(-sex)%>%
rename(sex = X)%>%
as.data.frame()
### pulled from scoreUDS_mirt_toShare.R --- NO NEED TO run that script if running this
# Note. Please make sure that your sex variable (named SEX) has been recoded such that male = 0, female = 1
################################# NO CHANGES NEEDED PAST THIS POINT ##############################################
#### Clean your data ####
# Lower-case all names, blank out out-of-range scores (NA-preserving, since a
# length-1 replacement ignores NA positions in the logical subscript), and
# build the Trails ratio variables used by the recode step below.
names(data_raw) <- tolower(names(data_raw))
for (v in c("animals", "veg", "udsverfc", "udsverlc")) {
  data_raw[[v]][data_raw[[v]] > 70] <- NA
}
for (v in c("traila", "trailb")) {
  data_raw[[v]][data_raw[[v]] > 400] <- NA
}
data_raw$digbacct <- as.integer(data_raw$digbacct)
for (v in c("digbacct", "trailali", "trailbli")) {
  data_raw[[v]][data_raw[[v]] > 70] <- NA
}
# lines per minute on Trails A/B (presumably lines correct * 60 / seconds — confirm)
data_raw$trailA_ratio <- (data_raw$trailali * 60) / data_raw$traila
data_raw$trailB_ratio <- (data_raw$trailbli * 60) / data_raw$trailb
#### End clean ####
#### Recode continuous variables to ordinal ####
# Recode script was written by Dan Mungas
var_OG <- c("animals","veg","udsverfc","udsverlc","digbacct", "trailA_ratio", "trailB_ratio")
# Add the "_r" suffix for the transformed variable names. paste0() is already
# vectorized over character vectors, so the original lapply()/unlist()
# round-trip was unnecessary; the result is the same unnamed character vector.
var_Recode <- paste0(var_OG, "_r")
# Recode raw test scores onto the ordinal/continuous scale used by the
# UDS3-EF MIRT model (recode logic originally written by Dan Mungas).
#
# Args:
#   df           data.frame holding `varlist_orig` (or, legacy: the NAME of
#                one as a string, resolved via eval(parse()))
#   varlist_orig character vector of source variable names
#   varlist_tr   character vector of target (recoded) variable names
#   type         "continuous" = linear interpolation within each lookup bin;
#                anything else (e.g. "ordinal") = the bin's floor value
#   lookup       optional lookup data.frame (or its name as a string)
#   lu_type      "lookup" = lookup holds recode_score plus per-variable
#                min_*/max_* bin boundaries; otherwise bins are derived from
#                observed (orig, tr) value pairs in the lookup/data
# Returns: `df` with the `varlist_tr` columns filled in.
recodeLookup <- function(df,varlist_orig, varlist_tr, type="continuous", lookup=NULL, ### should this be ordinal.
                         lu_type="data") {
  # NOTE(review): eval(parse()) on a string name is fragile; callers in this
  # file always pass data.frames, so those branches are effectively dead here.
  if (is.data.frame(df)){
    rcd <- df
  } else {
    rcd <- eval(parse(text=df))
  }
  # resolve the lookup source: the data itself, a supplied data.frame, or a
  # supplied object name
  if (is.null(lookup)){
    rcdlu <- rcd[,c(varlist_orig,varlist_tr)]
  } else{
    if (is.data.frame(lookup)) {
      if (lu_type == "lookup") {
        rcdlu <- lookup
      } else {
        rcdlu <- lookup[,c(varlist_orig,varlist_tr)]
      }
    } else {
      if (lu_type == "lookup") {
        rcdlu <- eval(parse(text=lookup))
      } else {
        rcdlu <- eval(parse(text=lookup))
        rcdlu <- rcdlu[,c(varlist_orig,varlist_tr)]
      }
    }
  }
  # seq_along()/seq_len() instead of 1:length()/1:nrow(): the loops are
  # no-ops (rather than errors) on zero-length input
  for (j in seq_along(varlist_orig)){
    if (lu_type == "lookup") {
      # build the bin table for this variable: each bin's orig_max is the
      # next bin's orig_min
      luv <- rcdlu[,c("recode_score", paste0("min_", varlist_orig[j]), paste0("max_",varlist_orig[j]))]
      names(luv) <- sub("recode_score","tr_min",names(luv))
      luv$tr_max <- luv$tr_min
      names(luv) <- c("tr_min","orig_min","orig_max","tr_max")
      luv <- luv[!is.na(luv$orig_min),]
      for (i in seq_len(nrow(luv))) {
        if (i < nrow(luv)) {
          luv[i,"orig_max"] <- luv[i+1,"orig_min"]
        }
      }
    } else {
      # derive the bin table from the observed (orig, tr) pairs
      t5 <- unique(rcdlu[!is.na(rcdlu[,varlist_tr[j]]),c(varlist_orig[j],varlist_tr[j])])
      t5 <- t5[order(t5[varlist_orig[j]]),]
      luv <- as.data.frame(cbind(t5,rbind(t5[2:nrow(t5),],c(NA,NA))))
      colnames(luv) <- c("orig_min","tr_min","orig_max","tr_max")
    }
    mino <- min(luv[,"orig_min"], na.rm=TRUE)
    mint <- min(luv[,"tr_min"], na.rm=TRUE)
    maxo <- max(luv[,"orig_max"], na.rm=TRUE)
    maxt <- max(luv[,"tr_max"], na.rm=TRUE)
    # seed the extremes; in-range values are filled by the range join below
    rcd[,varlist_tr[j]] <- ifelse(rcd[,varlist_orig[j]] <= mino,mint,NA)
    rcd[,varlist_tr[j]] <- ifelse(rcd[,varlist_orig[j]] >= maxo,maxt,rcd[,varlist_tr[j]])
    t3 <- rcd[,c(varlist_orig[j],varlist_tr[j])]
    # range (non-equi) join of each value into its bin. The original had an
    # if/else here whose two branches built the identical query; collapsed.
    sqlcd <- paste("SELECT * FROM t3 AS t3
      LEFT JOIN
      (SELECT * FROM luv)
      AS luv1 ON t3.",varlist_orig[j],
      " >= luv1.orig_min AND t3.", varlist_orig[j],
      " < luv1.orig_max",sep="")
    t4 <- sqldf::sqldf(sqlcd)
    if (type == "continuous") {
      # interpolate linearly within the bin; values at/above the top map to maxt
      t4[,varlist_tr[j]] <- ifelse(!is.na(t4[,varlist_tr[j]]),t4[,varlist_tr[j]],
        ifelse(t4[,varlist_orig[j]] >= maxo,maxt,
          ( ( (t4[,varlist_orig[j]] - t4$orig_min) / (t4$orig_max - t4$orig_min) ) *
            (t4$tr_max - t4$tr_min) ) + t4$tr_min))
    } else {
      # ordinal: take the bin's floor value; top values map to the max recode
      t4[,varlist_tr[j]] <- ifelse(!is.na(t4[,varlist_tr[j]]),t4[,varlist_tr[j]],
        ifelse(t4[,varlist_orig[j]] >= max(t4$orig_max,na.rm=TRUE),max(t4$tr_max,na.rm=TRUE),
          t4$tr_min))
    }
    rcd[,varlist_tr[j]] <- t4[,varlist_tr[j]]
  }
  return(rcd)
}
# Ordinal-recode lookup table shipped with the scoreUDS scripts
# (recode_score plus per-variable min_*/max_* bin boundaries).
luv1 <- read.table(paste0(repofolder, "score_uds_folder/UDS3_EF_Lookup_4.21.20_share.csv"),
                   header = TRUE, sep = ",")
# Recode the cleaned raw scores onto the model's ordinal scale.
recoded_dat <- recodeLookup(data_raw, var_OG, var_Recode,
                            type = "ordinal", lookup = luv1, lu_type = "lookup")
# Load the shared NACC MIRT model fit (expected to provide `uds_res_results`,
# used by fscores() below).
load(paste(repofolder,"score_uds_folder/nacc_fit_MIRT_share.Rdata", sep =''))
# count the missing values in a vector
countMiss <- function(x) {sum(is.na(x))}
# rowid lets us join the factor scores back onto the full recoded data
recoded_dat$rowid <- 1:nrow(recoded_dat)
recoded_dat_VarsOnly <- recoded_dat[,var_Recode]
recoded_dat_VarsOnly$rowid <- 1:nrow(recoded_dat_VarsOnly)
recoded_dat_VarsOnly$n_miss <- apply(recoded_dat_VarsOnly[,var_Recode],1, countMiss)
# extracts data frame with records that have non-missing items
recode_cal <- recoded_dat_VarsOnly[recoded_dat_VarsOnly$n_miss < length(var_Recode),]
# EAP factor scores (with SEs) from the pre-fit NACC model.
# NOTE(review): T/F should be TRUE/FALSE; left unchanged in this doc-only edit.
fsc <- data.frame(mirt::fscores(uds_res_results, response.pattern = recode_cal[,var_Recode],
method = "EAP" , QMC=TRUE,full.scores.SE=T,na.rm=F))
fsc$rowid <- recode_cal$rowid
# rename the factor-score columns: F -> uds3_ef, SE_F -> SE_uds3_ef
# (sub() replaces the first "F" in each name)
names(fsc) <- sub("F","uds3_ef",names(fsc))
vars2merge<- c("rowid","uds3_ef","SE_uds3_ef")
# left join so rows with all items missing keep NA scores
final_dat <- plyr::join(recoded_dat, fsc[,vars2merge], by = c("rowid"), type ="left")
#### End Recode ####
### Demographic Corrections ####
# Important Disclaimers:
#1. Please note that these scores must be interpreted with extreme caution if the demographics of your sample
#differ from the demographics of the NACC normative dataset, which is highly white (X%), well educated, and English speaking.
#Validation work is underway for individuals with different demographic variables.
#2. If using with longitudinal data, consider creating a baseline age variable for each person. By using time-varying age,
#there is the potential that an individual's longitudinal performance is altered due to changing age at each timepoint, which might
# not be desirable for all studies.
#3. Adjusted scores will only be calculated for a row of data if all three demographic variables are available.
# for sex, recode so that male = 0, female = 1
#### Demographic Corrections ####
# Varset should stay the same - these are the variable names from the normative dataset
varset <- c("uds3_ef")
# varnames refers to the variable names in the dataset on which we are calculating the z-scores.
varnames <- c("uds3_ef")
datfull_1 <- as.data.frame(final_dat)
## create the demographic variables under the names the lookup tables expect
## (EDUC, NACCAGE, and SEX)
datfull_1$EDUC <- datfull_1$educ
datfull_1$NACCAGE <- as.integer(datfull_1$naccage)
datfull_1$SEX <- as.integer(datfull_1$sex)
## rowid lets us join the adjusted scores back after dropping incomplete rows
## (seq_len() is safe even if the frame somehow has zero rows)
datfull_1$rowid <- seq_len(nrow(datfull_1))
## keep only rows with the score and all three demographics present.
## filter_at()/all_vars() are superseded; if_all() is the current dplyr idiom
## and selects the same rows.
datfull_2 <- datfull_1 %>%
  filter(if_all(all_of(c("uds3_ef", "EDUC", "NACCAGE", "SEX")), ~ !is.na(.x)))
## Bringing in extreme predictor values
# Clamp age and education into the range covered by the normative lookup
# tables. ROBUSTNESS FIX: seq_len(nrow(...)) instead of 1:nrow(...) — the
# original loop would error on an empty datfull_2 (1:0 yields c(1, 0)).
for(i in seq_len(nrow(datfull_2))){
  if(datfull_2$NACCAGE[i] < 49.5) datfull_2$NACCAGE[i] <- 50
  if(datfull_2$NACCAGE[i] > 90.5) datfull_2$NACCAGE[i] <- 90
  if(datfull_2$EDUC[i] < 9.5) datfull_2$EDUC[i] <- 10
  if(datfull_2$EDUC[i] > 20.5) datfull_2$EDUC[i] <- 20
}
# Standardize x against mean xbar and SD s: z = (x - xbar) / s.
# Fully vectorized; also works column-wise on single-column data frames,
# which is how the demographic-correction loop below calls it.
zscore <- function(x, xbar, s) {
  centered <- x - xbar
  centered / s
}
# For each score in varset: load its demographic lookup table, join the
# age/education/sex-specific adjusted mean and SD onto datfull_2, and compute
# the demographically adjusted z-score.
for (i in seq_along(varset)){
  lookup <- read.csv(paste(repofolder,'score_uds_folder/LookUps/',varset[i],"_lookup.csv", sep=""))
  label1 <- as.character(paste(varset[i],"_mean.adj", sep=""))
  lookup[label1] <- lookup$mean.adj ### Need to use brackets to assign a character as a variable name.
  label2 <- paste(varset[i],"_sd.adj", sep="")
  lookup[label2] <- lookup$sd.adj
  lookup$sd.adj <- NULL
  lookup$mean.adj <- NULL
  label3 <- as.character(varnames[i])
  merged_data <- plyr::join(datfull_2, lookup, by = c("NACCAGE", "EDUC","SEX"), type ="left")
  datfull_2[label1] <- merged_data[label1]
  datfull_2[label2] <- merged_data[label2]
  zscore_label <- as.character(paste(varset[i],"_Z", sep=""))
  datfull_2[zscore_label] <- zscore(datfull_2[label3], datfull_2[label1], datfull_2[label2] )
  # BUGFIX: the original assigned the STRING "NA" here, which would coerce
  # the whole z-score column to character whenever any raw score was missing
  # (harmless so far only because datfull_2 is pre-filtered to non-missing
  # scores); use a real numeric NA so the column stays numeric.
  datfull_2[zscore_label][is.na(datfull_2[label3])] <- NA_real_
}
# Join the adjusted scores back onto ALL rows (rows dropped for missing
# demographics simply get NA), then standardize the key column names.
vars2merge_demo<- c("rowid", "uds3_ef_mean.adj", "uds3_ef_sd.adj", "uds3_ef_Z")
#bv_data_UDS_recoded <- as.data.frame(bv_data_UDS_recoded)
final_dat_demo <- plyr::join(datfull_1, datfull_2[,vars2merge_demo], by = c("rowid"), type ="left")
# convert to tibble and rename columns
# (dcdate/pidn exist in lower case from the earlier tolower() step —
# presumably retained through the curation select() as grouping vars; confirm)
final_dat_demo<-final_dat_demo%>%as_tibble()%>%rename(DCDate = dcdate, PIDN = pidn)
# Build the consolidated UDS neuropsych table:
#   1. stack the three UDS neuropsych instruments,
#   2. recode instrument-specific missing/error codes to NA,
#   3. keep one row per PIDN/DCDate (the row with the fewest NAs),
#   4. attach the demographically-adjusted EF scores from final_dat_demo.
#saving output of scoring from code from scoreUDS_mirt_toShare.R to the curated dataset requested
bind_rows(
lava$udsneuropsych,
lava$udsneuropsychtcog,
lava$udsneuropsychmoca)%>%
select(PIDN, DCDate, instr_type, form_ver, v_type, traila, trailarr, trailali, trailb, trailbrr, trailbli, wais,
memunits, pentagon, craftvrs, crafturs, udsbentc, digforct, craftdvr, craftdre,
udsbentd, udsbenrs, minttots, udsvertn, cogstat, logimem:boston, mocatots, mmse_zscore:boston_zscore)%>%
# Negative and 99x values appear to be "not done / not applicable" codes
# (presumably NACC conventions — TODO confirm against the data dictionary).
mutate(across(c(traila:mocatots),
~ifelse(. %in% c(-1, -2, -5, -6, -7, -8, -9, 995, 996, 997, 998), NA, .)))%>%
# 88 and 95-99 are additional missing-data codes on error/line counts and
# the remaining score columns.
mutate(across(c(trailarr, trailali, trailbrr, trailbli, memunits:mocatots),
~ifelse(. %in% c(88, 95, 96, 97, 98, 99), NA, .)))%>%
# 70 looks like an out-of-range sentinel for digib — verify with source.
mutate(digib = na_if(digib, 70))%>%
# Deduplicate visits: order rows so the most complete record (fewest NAs
# across all columns) comes first, then keep the first per PIDN/DCDate.
group_by(PIDN,DCDate)%>%
arrange(PIDN, DCDate, rowSums(is.na(.)))%>%
# NOTE(review): the result is still grouped by PIDN/DCDate after distinct();
# downstream joins appear to tolerate this, but confirm no ungroup is needed.
distinct(PIDN,DCDate, .keep_all = TRUE)%>%
# Attach adjusted EF composite (join keys inferred from common columns).
left_join(final_dat_demo%>%
as_tibble())%>%
# Drop "_r" recode columns that were only needed during scoring.
select(-ends_with('_r'))%>%
# Surface the composite and z-score columns right after the visit type.
relocate(uds3_ef,
SE_uds3_ef,
uds3_ef_Z,
uds3_ef_mean.adj,
uds3_ef_sd.adj,
digiflen_zscore,
digiblen_zscore,
mmse_zscore:boston_zscore,
.after = v_type)%>%
# Remove join scaffolding and raw demographic inputs before publishing.
select(-c(instr_type, form_ver, EDUC, NACCAGE, SEX, educ, naccage, sex, rowid)) ->
uds_neuropsych
All dataframes listed here will be joined to the timepoints.
# Named list of every curated instrument dataframe that will be joined to
# the anchor timepoints. lst() keeps each object's own name as its list name.
curated_dfs_named <- lst(
#bedside data and composites
bedside,
#uds_neuropsych
uds_neuropsych,
## infoprocspeed
infoprocessingspeed,
##1 Back
`1back`,
## 2 Back
`2back`,
## Flanker eprime
enclosedflanker,
## SetShifting eprime
setshifting,
## neuroexam
adrcneuroexam,
## uds health history
health_history,
## uds medical conditions
med_conditions,
## hbudsphysical
physical,
## uds medications prescription
medications,
## uds otc meds
otc_meds,
## uds vitamins data
supplements,
## Need to calc composite scoring for this
brainhealthassessment,
## cdr
cdr,
## champs
champs,
## CAS -- scored
## NOTE(review): object name 'cogntive_activity_scale' is misspelled where
## it is defined upstream; kept as-is so the reference resolves.
cogntive_activity_scale,
## cpt
cpt,
## synaptic_markers
csf,
## diagnosis
diagnosis,
## early dev history
earlydevhistory,
## everydaycogself
everydaycogself,
## faq
faq,
## fisherman story
fishermanstory,
## fitbit data
fitbit,
## GAD
gad,
## GAIT
gait,
## Grit
grit,
## hbclinicallabs_myelinucddra
clinical_labs,
## pet_data
pet,
## MRI data
# schaefer_imaging = imaging_ip$schaefer_imaging,
imaging_wmh,
imaging_pasl,
imaging_pcasl,
# diffusion_tensor_imaging = imaging_ip$imaging_dti_joined,
# cerebral_blood_flow_pulsed_ASL = imaging_ip$pasl_combined,
# `cerebral_blood_flow_pseudo-continuous_ASL` = imaging_ip$pcasl_combined,
# imaging_gm_all_production = imaging_ip$`mri_box_data_modified_clean_names$imaging_gm_all_production`,
# imaging_cbf_pvc_gm = imaging_ip$`mri_box_data_modified_clean_names$imaging_cbf_pvc_gm`,
# imaging_t1 = imaging_ip$`old_imaging$imaging_t1`,
## MAAS
maas_mindfulness,
## mesoscale
mesoscale,
## mind diet
diet,
## npi -- joining the two npi instruments
npi,
## PASE
pase,
## pattern separation
patternseparation,
## psqi
psqi,
#PSS
pss,
## berlin sleep
berlin_sleep,
## edinburgh_handedness_inventory
edinburgh_handedness,
## epworth sleepiness scale
epworth_sleepiness,
## insomnia severity
insomnia_severity_index,
## other sleep instruments not collected by hillblom
sleep_instruments,
## sleep apnea
apnea,
## sleep profiler
sleep_profiler,
## sex and reproduction
sex_and_reproductive_health,
## sni--only have scoring for some of the variables included previously
social_network_index,
## quanterix
quanterix,
## quanterix_fitbit
quanterix_fitbit,
## bu_subset_data
bu_subset_data,
## osu_tbi
osu_tbi_locpta_subset_data,
## tabcat_data
tabcat_bha,
tabcat_animal_fluency,
## consolidated dot counting data
dot_counting,
## tabcat_data
tabcat_flanker,
tabcat_ll,
tabcat_lo,
tabcat_match,
tabcat_rapid_naming,
tabcat_running_dots,
tabcat_set_shifting,
tabcat_fav,
## virtual_bedside (from qualtrics)
virtual_bedside)
# Merge dataframes that share a name into a single df per instrument.
curated_dfs_named <- combiner(curated_dfs_named)
# Where an instrument has duplicated entries, keep the row with the
# fewest NA values.
curated_dfs_named <- map(curated_dfs_named, clean_instrument)
#get all dfs
# Gather the anchor dataframes from whichever sources this session created:
# timepoints generated above and/or pre-existing data without timepoints.
# A missing source contributes NULL, which c() silently drops. (The old
# pluck(x) wrappers were no-ops and have been removed.)
list_of_dfs <- c(
  if (exists('timepoints_dfs')) timepoints_dfs,
  if (exists('original_data_no_tps')) original_data_no_tps)
#strip 'original_data' from name. might not be needed since we set names later on
names(list_of_dfs) <- str_replace(names(list_of_dfs), "^original_data_", "")
# Per-dataset metadata consumed by the assembly loop below; anchors missing
# a 'dataset_name'/'df_suffix' attribute trigger an interactive prompt instead.
attr(list_of_dfs$fitbit, "dataset_name") <- 'fitbit_dataset'
attr(list_of_dfs$BrANCH, "dataset_name") <- 'BrANCH_dataset'
attr(list_of_dfs$specimens, "dataset_name") <- 'specimens_dataset'
attr(list_of_dfs$sleep, "dataset_name") <- 'sleep_dataset'
attr(list_of_dfs$fitbit, 'df_suffix') <- 'fitbit'
attr(list_of_dfs$specimens, 'df_suffix') <- 'specimens'
attr(list_of_dfs$sleep, 'df_suffix') <- 'sleep'
# Menu encoding used downstream: 1 = "Yes, this anchor's data is already in
# curated_dfs_named" (it gets dropped before the join), 2 = "No".
attr(list_of_dfs$fitbit, "run_chunk_name_in_curated_dfs") <- 1
attr(list_of_dfs$specimens, 'run_chunk_name_in_curated_dfs') <- 2
attr(list_of_dfs$sleep, 'run_chunk_name_in_curated_dfs') <- 2
# Coerce PIDN to double in every curated df that has one, so joins against
# the anchors never fail on an integer/double key-type mismatch.
curated_dfs_named <- curated_dfs_named %>%
  keep(~any(names(.x) == 'PIDN')) %>%
  map(~.x %>% mutate(PIDN = as.double(PIDN)))
# NOTE(review): this empty list is immediately overwritten below; it only
# matters if the map() call is aborted part-way through a session.
datasets <- list()
# For each anchor dataframe, assemble one finished dataset:
#   - join every dated curated instrument within a 365-day window,
#   - join undated instruments (PIDN but no DCDate) by PIDN alone,
#   - attach demographics, latest diagnosis, and genetics,
#   - name the result (from attributes or an interactive prompt).
# flatten() turns the per-anchor one-element named lists into one flat list.
datasets <- flatten(map(names(list_of_dfs), function(df_name) {
# Access the dataframe using its name
anchor_df <- list_of_dfs[[df_name]]
#reset suffix
df_suffix <- ""
anchor_data <- anchor_df
# Check if the dataframe has columns other than PIDN and DCDate
# (i.e. the anchor carries instrument data of its own, not just visit keys)
# and no suffix attribute was pre-set — then prompt for one.
if (ncol(ungroup(anchor_df)) > 2 & is.null(attr(anchor_df, 'df_suffix'))) {
# Ask the user to enter a suffix for column names
df_suffix <- readline(paste0("Enter suffix to append to column names in '", df_name,
"' if linking instrument data along with the timepoints:"))
# Tag the anchor's own columns with ".suffix" so they stay identifiable
# after all the instrument joins below.
anchor_data <- anchor_df %>%
rename_with(
~ paste(., df_suffix, sep = '.'),
-c('PIDN', 'DCDate'))
if (is.null(attr(anchor_df, "run_chunk_name_in_curated_dfs"))) {
# Use the name of the dataframe in the title of the menu
run_chunk_name_in_curated_dfs <- utils::menu(
choices = c("Yes", "No"),
title = paste0("Is data from '", df_name, "' already in curated_dfs_named?", sep = ' '))
} else {
run_chunk_name_in_curated_dfs <- attr(anchor_df, "run_chunk_name_in_curated_dfs")
}
# 1 == "Yes": drop the anchor's own instrument from curated_dfs_named so
# it is not joined back onto itself; keep a copy to restore afterwards.
if (run_chunk_name_in_curated_dfs == 1) {
unmodified_curated_dfs <- curated_dfs_named
curated_dfs_named <- discard(curated_dfs_named, names(curated_dfs_named) %in% df_suffix)
}
} else if (ncol(ungroup(anchor_df)) > 2 & !is.null(attr(anchor_df, 'df_suffix'))) {
# Same as the branch above, except the suffix comes from the 'df_suffix'
# attribute instead of an interactive prompt.
df_suffix <- as.character(attr(anchor_df, 'df_suffix'))
anchor_data <- anchor_df %>%
rename_with(
~ paste(., df_suffix, sep = '.'),
-c('PIDN', 'DCDate'))
if (is.null(attr(anchor_df, "run_chunk_name_in_curated_dfs"))) {
# Use the name of the dataframe in the title of the menu
run_chunk_name_in_curated_dfs <- utils::menu(
choices = c("Yes", "No"),
title = paste0("Is data from '", df_name, "' already in curated_dfs_named?", sep = ' '))
} else {
run_chunk_name_in_curated_dfs <- attr(anchor_df, "run_chunk_name_in_curated_dfs")
}
if (run_chunk_name_in_curated_dfs == 1) {
unmodified_curated_dfs <- curated_dfs_named
curated_dfs_named <- discard(curated_dfs_named, names(curated_dfs_named) %in% df_suffix)
}
}
# Vascular burden is computed fresh for this anchor's PIDN/visit pairs.
vasc_burden <- get_vasc_burden(select(anchor_df, PIDN, DCDate))
# Join all dated curated instruments to the anchor visits (365-day window,
# per join_list_of_dfs_to_timepoints).
joined_data <- join_list_of_dfs_to_timepoints(select(anchor_df, PIDN, DCDate), curated_dfs_named%>%append(lst(vasc_burden)), 365)
dataset_joined <- left_join(anchor_data, joined_data, by = c('PIDN', 'DCDate'))
# Undated instruments (have PIDN, no DCDate) are joined by PIDN alone.
dataset_joined_no_date_data <- dataset_joined%>%
left_join(curated_dfs_named%>%
keep(~any(names(.x) == 'PIDN', na.rm = T))%>%
discard(~any(names(.x) == 'DCDate', na.rm = T))%>%
#renaming variables in list of dataframes to include .df after the name--keeps track of where data came from
imap(function(x, y){
x %>% rename_with(~paste(.,y, sep = '.'), -c('PIDN'))})%>%
reduce(., full_join, by = 'PIDN'), by = 'PIDN')
# Request user to set variable name
if (is.null(attr(anchor_df, "dataset_name"))) {
dataset_name <- readline(paste0("Please name the ", df_name, " dataset:", sep = " "))
} else {
dataset_name <- attr(anchor_df, "dataset_name")
}
# Define variable with user-specified name and assign that name to the final df
assign(dataset_name,
#joining RFM and demographics to the front of the master df.
dataset_joined_no_date_data%>%
# NOTE(review): any() here is evaluated over the whole ungrouped column, so
# this filter keeps either all rows or none; the row-level removal happens
# in drop_na() below — confirm the filter is intentional.
filter(any(!is.na(DCDate)))%>%
drop_na(DCDate)%>%
left_join(demographics_curated_df%>%
rename_with(~paste(.,'demographics',sep = '.'),-c('PIDN')), by = 'PIDN')%>%
mutate(dob.demographics = as_date(dob.demographics),
age_at_DCDate = eeptools::age_calc(dob.demographics, DCDate, units = 'years', precise = FALSE))%>%
select(-dob.demographics)%>%
relocate(matches('.demographics'), .after = ncol(anchor_data)) %>% # Move this line up
left_join(diagnosis_latest%>%
rename_with(~paste(., 'diagnosis_latest', sep = '.'), -c('PIDN')), by = 'PIDN')%>%
relocate(matches('.diagnosis_latest'), .after = matches("res_dx_b.diagnosis"))%>%
left_join(genetics_data%>%
rename_with(~paste(.,'genetics',sep = '.'), -c('PIDN')), by = 'PIDN')%>%
relocate(PIDN : DCDate, age_at_DCDate)%>%
# UnQID: PIDN concatenated with the visit date encoded as an Excel-style
# serial day count (days since 1899-12-30), making the pair a single number.
mutate(UnQID = as.numeric(str_c(PIDN,as.numeric(as_date(DCDate, format = '%m/%d/%Y') - as_date('1899-12-30', format = '%Y-%m-%d')))))%>%
relocate(UnQID, DCDate, age_at_DCDate, .after = PIDN)%>%
remove_empty(c("rows", "cols")))
#remove unmodified_curated_dfs if it was created and set curated_dfs back to itself
# NOTE(review): exists() also finds this name in the global environment, so
# a leftover global 'unmodified_curated_dfs' from an earlier run would be
# picked up here — confirm that cannot happen in practice.
if(exists('unmodified_curated_dfs')) {
curated_dfs_named <- unmodified_curated_dfs
remove(unmodified_curated_dfs)
}
# Return a one-element named list so flatten() yields one entry per anchor.
set_names(lst(get(dataset_name)), dataset_name)
}))
# Snapshot today's finished datasets to the data folder; never overwrite an
# existing file for the same date.
filename <- str_c(datafolder, "datasets_", today(), ".RDS")
if (!file.exists(filename)) {
  saveRDS(datasets, filename)
} else {
  # Be explicit about the skip instead of silently doing nothing.
  message("Not saving: '", filename, "' already exists.")
}
Social Network Index (SNI)